In [1]:
# @title Necessary imports and globals.
import numpy as np
import os
import dopamine
from dopamine.agents.dqn import dqn_agent
from dopamine.discrete_domains import run_experiment, atari_lib
from dopamine.colab import utils as colab_utils
from absl import flags
import gin.tf

BASE_PATH = 'running-data'  # @param
GAME = 'BattleZone'  # @param

In [2]:
experimental_data = colab_utils.load_baselines('./baselines-data/')

In [3]:
LOG_PATH = os.path.join(BASE_PATH, 'prioritized_srdqn', GAME)

In [7]:
# @title Create the DQN with prioritized replay
from dopamine.replay_memory import prioritized_replay_buffer
import tensorflow as tf

class PrioritizedSRDQNAgent(dqn_agent.DQNAgent):
  def __init__(self, sess, num_actions):
    """This maintains all the DQN default argument values."""
    super().__init__(sess, num_actions, tf_device='/cpu:*')
    self._replay_scheme = 'prioritized'
    
    with tf.device('/gpu:0'):
        self._build_networks()

        self._train_op = self._build_train_op()
        self._sr_train_op = self._build_sr_train_op()
        self._sync_qt_ops = self._build_sync_op()
        
    print('finished constructing')
    self.online_convnet.summary()
    self.sr_convnet.summary()
        

  def _build_networks(self):
    """Builds the Q-value network computations needed for acting and training.

    These are:
      self.online_convnet: For computing the current state's Q-values.
      self.target_convnet: For computing the next state's target Q-values.
      self.sr_convnet: For computing the sr for state-action pair
      self._net_outputs: The actual Q-values.
      self._q_argmax: The action maximizing the current state's Q-values.
      self._replay_net_outputs: The replayed states' Q-values.
      self._replay_next_target_net_outputs: The replayed next states' target
        Q-values (see Mnih et al., 2015 for details).
    """

    # _network_template instantiates the model and returns the network object.
    # The network object can be used to generate different outputs in the graph.
    # At each call to the network, the parameters will be reused.
    self.online_convnet = self._create_network(name='Online')
    self.target_convnet = self._create_network(name='Target')
    self._net_outputs = self.online_convnet(self.state_ph)
    # TODO(bellemare): Ties should be broken. They are unlikely to happen when
    # using a deep network, but may affect performance with a linear
    # approximation scheme.
    self._q_argmax = tf.argmax(self._net_outputs.q_values, axis=1)[0]
    self._replay_net_outputs = self.online_convnet(self._replay.transition['state'])
    self._replay_next_target_net_outputs = self.target_convnet(
        self._replay.transition['next_state'])
    
    self._q_argmax_sr = tf.argmax(self._net_outputs.q_values, axis=1)
    self.sr_convnet = atari_lib.SRNetwork(self.num_actions, atari_lib.NATURE_DQN_STACK_SIZE)
    # sr for states sampled
    self._sr_net_outputs = self.sr_convnet(self._replay.transition['state'])
    # sr for next_states sampled
    self._sr_net_outputs_next = self.sr_convnet(self._replay.transition['next_state'])
    # sr for current state and action
    self._sr_net_curr_state = self.sr_convnet(self.state_ph)

  def _build_replay_buffer(self, use_staging):
    return prioritized_replay_buffer.WrappedPrioritizedReplayBuffer(
        observation_shape=self.observation_shape,
        stack_size=self.stack_size,
        use_staging=use_staging,
        update_horizon=self.update_horizon,
        gamma=self.gamma,
        observation_dtype=self.observation_dtype.as_numpy_dtype)
  
  def _build_sr_train_op(self):
    feature = self._sr_net_outputs.feature
    decoded_state = self._sr_net_outputs.decoded_state
    
    loss_ae = tf.compat.v1.losses.huber_loss(
        self._replay.states, decoded_state, reduction=tf.losses.Reduction.NONE
    )
    srs = self._sr_net_outputs.sr_values
    indices = tf.transpose(tf.stack([self._replay.actions, tf.constant([i for i in range(32)])]))
    srs = tf.gather_nd(srs, indices)
    
    srs_next = self._sr_net_outputs_next.sr_values
    indices_next = tf.transpose(tf.stack([self._replay.next_actions, tf.constant([i for i in range(32)])]))
    srs_next = tf.gather_nd(srs_next, indices_next)
    
    assert feature.shape == srs_next.shape
    assert srs.shape == feature.shape
    
    loss_sr = tf.compat.v1.losses.mean_squared_error(
        srs, feature + self.gamma * srs_next
    )
    loss = loss_ae + loss_sr
    return self.optimizer.minimize(tf.reduce_mean(loss))

  def _build_train_op(self):
    """Builds a training op.
    Returns:
      train_op: An op performing one step of training from replay data.
    """
    replay_action_one_hot = tf.one_hot(
        self._replay.actions, self.num_actions, 1., 0., name='action_one_hot')
    replay_chosen_q = tf.reduce_sum(
        self._replay_net_outputs.q_values * replay_action_one_hot,
        axis=1,
        name='replay_chosen_q')
    
    # output from the SR network
    # note that the back prop of the q-loss should not take into account
    # the graph of the need term.
    curr_action = tf.stop_gradient(self._q_argmax_sr)
    sample_features = self._sr_net_outputs.feature
    curr_sr = self._sr_net_curr_state.sr_values    
    curr_sr = tf.gather_nd(curr_sr, curr_action)
    need = tf.stop_gradient(
        tf.tensordot(curr_sr, sample_features, axes=[[1], [1]])[0]
    )
    

    target = tf.stop_gradient(self._build_target_q_op())
    loss = tf.compat.v1.losses.huber_loss(
        target, replay_chosen_q, reduction=tf.losses.Reduction.NONE)
    # The original prioritized experience replay uses a linear exponent
    # schedule 0.4 -> 1.0. Comparing the schedule to a fixed exponent of 0.5
    # on 5 games (Asterix, Pong, Q*Bert, Seaquest, Space Invaders) suggested
    # a fixed exponent actually performs better, except on Pong.
    probs = self._replay.transition['sampling_probabilities']
    loss_weights = 1.0 / tf.sqrt(probs + 1e-10)
    loss_weights /= tf.reduce_max(loss_weights)

    # Rainbow and prioritized replay are parametrized by an exponent alpha,
    # but in both cases it is set to 0.5 - for simplicity's sake we leave it
    # as is here, using the more direct tf.sqrt(). Taking the square root
    # "makes sense", as we are dealing with a squared loss.
    # Add a small nonzero value to the loss to avoid 0 priority items. While
    # technically this may be okay, setting all items to 0 priority will cause
    # troubles, and also result in 1.0 / 0.0 = NaN correction terms.
    update_priorities_op = self._replay.tf_set_priority(
        self._replay.indices, tf.sqrt(loss + 1e-10))

    # Weight the loss by the inverse priorities.
#     loss = loss_weights * loss * need
    loss = loss_weights * loss
    
    assert need.shape == loss.shape
    loss_need = need * loss
    
    with tf.control_dependencies([update_priorities_op]):
      if self.summary_writer is not None:
        with tf.compat.v1.variable_scope('Losses'):
          tf.compat.v1.summary.scalar('HuberLoss', tf.reduce_mean(loss_need))
      return self.optimizer.minimize(tf.reduce_mean(loss_need))

  def _store_transition(self,
                        last_observation,
                        action,
                        reward,
                        is_terminal,
                        priority=None):
    priority = self._replay.memory.sum_tree.max_recorded_priority
    if not self.eval_mode:
      self._replay.add(last_observation, action, reward, is_terminal, priority)
    
  def _record_observation(self, observation):
    """Records an observation and update state.

    Extracts a frame from the observation vector and overwrites the oldest
    frame in the state buffer.

    Args:
      observation: numpy array, an observation from the environment.
    """
    # Set current observation. We do the reshaping to handle environments
    # without frame stacking.
    self._observation = np.reshape(observation, self.observation_shape)
    # Swap out the oldest frame with the current frame.
    self.state = np.roll(self.state, -1, axis=-1)
    self.state[0, ..., -1] = self._observation
    
  def _train_step(self):
    """Runs a single training step.

    Runs a training op if both:
      (1) A minimum number of frames have been added to the replay buffer.
      (2) `training_steps` is a multiple of `update_period`.

    Also, syncs weights from online to target network if training steps is a
    multiple of target update period.
    """
    # Run a train op at the rate of self.update_period if enough training steps
    # have been run. This matches the Nature DQN behaviour.
    if self._replay.memory.add_count > self.min_replay_history:
      if self.training_steps % self.update_period == 0:
        self._sess.run(self._train_op, {self.state_ph: self.state})
        print(self._replay.states.shape)
        print(self._replay.transition['state'].shape)
        self._sess.run(self._sr_train_op)
        if (self.summary_writer is not None and
            self.training_steps > 0 and
            self.training_steps % self.summary_writing_frequency == 0):
          summary = self._sess.run(self._merged_summaries)
          self.summary_writer.add_summary(summary, self.training_steps)

      if self.training_steps % self.target_update_period == 0:
        self._sess.run(self._sync_qt_ops)

    self.training_steps += 1
    
def create_prioritized_srdqn_agent(sess, environment, summary_writer=None):
  """The Runner class will expect a function of this type to create an agent."""
  return PrioritizedSRDQNAgent(sess, num_actions=environment.action_space.n)

prioritized_srdqn_config = """
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.agents.dqn.dqn_agent
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables

DQNAgent.gamma = 0.99
DQNAgent.update_horizon = 1
DQNAgent.min_replay_history = 20000  # agent steps
DQNAgent.update_period = 4
DQNAgent.target_update_period = 8000  # agent steps
DQNAgent.epsilon_train = 0.01
DQNAgent.epsilon_eval = 0.001
DQNAgent.epsilon_decay_period = 250000  # agent steps
DQNAgent.tf_device = '/gpu:0'  # use '/cpu:*' for non-GPU version
DQNAgent.optimizer = @tf.train.RMSPropOptimizer()

tf.train.RMSPropOptimizer.learning_rate = 0.00025
tf.train.RMSPropOptimizer.decay = 0.95
tf.train.RMSPropOptimizer.momentum = 0.0
tf.train.RMSPropOptimizer.epsilon = 0.00001
tf.train.RMSPropOptimizer.centered = True

atari_lib.create_atari_environment.game_name = '{}'
# Sticky actions with probability 0.25, as suggested by (Machado et al., 2017).
atari_lib.create_atari_environment.sticky_actions = True
create_agent.agent_name = 'dqn'
Runner.num_iterations = 200
Runner.training_steps = 250000  # agent steps
Runner.evaluation_steps = 125000  # agent steps
Runner.max_steps_per_episode = 27000  # agent steps

WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
""".format(GAME)
gin.parse_config(prioritized_srdqn_config, skip_unknown=False)

# Create the runner class with this agent. We use very small numbers of steps
# to terminate quickly, as this is mostly meant for demonstrating how one can
# use the framework.
prioritized_srdqn_runner = run_experiment.TrainRunner(LOG_PATH, create_prioritized_srdqn_agent)

finished constructing
Model: "Online"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv (Conv2D)                multiple                  8224      
_________________________________________________________________
Conv (Conv2D)                multiple                  32832     
_________________________________________________________________
Conv (Conv2D)                multiple                  36928     
_________________________________________________________________
flatten_15 (Flatten)         multiple                  0         
_________________________________________________________________
fully_connected (Dense)      multiple                  3965440   
_________________________________________________________________
fully_connected (Dense)      multiple                  9234      
Total params: 4,052,658
Trainable params: 4,052,658
Non-trainable params: 0
____________________________

ResourceExhaustedError: OOM when allocating tensor of shape [451584,512] and type float
	 [[node sr_network_3/fully_connected/kernel/RMSProp/Initializer/ones (defined at C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py:1748) ]]

Original stack trace for 'sr_network_3/fully_connected/kernel/RMSProp/Initializer/ones':
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\traitlets\config\application.py", line 664, in launch_instance
    app.start()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\platform\asyncio.py", line 149, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\asyncio\base_events.py", line 442, in run_forever
    self._run_once()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\asyncio\base_events.py", line 1462, in _run_once
    handle._run()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 787, in inner
    self.run()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelbase.py", line 545, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 2867, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 2895, in _run_cell
    return runner(coro)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 3072, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 3263, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-dc351bfa4e8e>", line 243, in <module>
    prioritized_srdqn_runner = run_experiment.TrainRunner(LOG_PATH, create_prioritized_srdqn_agent)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\gin\config.py", line 1046, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\dopamine\discrete_domains\run_experiment.py", line 553, in __init__
    create_environment_fn)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\gin\config.py", line 1046, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\dopamine\discrete_domains\run_experiment.py", line 219, in __init__
    summary_writer=self._summary_writer)
  File "<ipython-input-6-dc351bfa4e8e>", line 200, in create_prioritized_srdqn_agent
    return PrioritizedSRDQNAgent(sess, num_actions=environment.action_space.n)
  File "<ipython-input-6-dc351bfa4e8e>", line 15, in __init__
    self._sr_train_op = self._build_sr_train_op()
  File "<ipython-input-6-dc351bfa4e8e>", line 90, in _build_sr_train_op
    return self.optimizer.minimize(tf.reduce_mean(loss))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\training\optimizer.py", line 413, in minimize
    name=name)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\training\optimizer.py", line 597, in apply_gradients
    self._create_slots(var_list)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\training\rmsprop.py", line 124, in _create_slots
    self._name)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\training\optimizer.py", line 1135, in _get_or_make_slot_with_initializer
    var, initializer, shape, dtype, op_name)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\training\slot_creator.py", line 164, in create_slot_with_initializer
    dtype)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\training\slot_creator.py", line 74, in _create_slot_var
    validate_shape=validate_shape)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variable_scope.py", line 1500, in get_variable
    aggregation=aggregation)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variable_scope.py", line 1243, in get_variable
    aggregation=aggregation)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variable_scope.py", line 567, in get_variable
    aggregation=aggregation)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variable_scope.py", line 519, in _true_getter
    aggregation=aggregation)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variable_scope.py", line 933, in _get_single_variable
    aggregation=aggregation)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variables.py", line 258, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variables.py", line 219, in _variable_v1_call
    shape=shape)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variables.py", line 197, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variable_scope.py", line 2503, in default_variable_creator
    shape=shape)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variables.py", line 262, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\resource_variable_ops.py", line 1406, in __init__
    distribute_strategy=distribute_strategy)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\resource_variable_ops.py", line 1537, in _init_from_args
    initial_value() if init_from_fn else initial_value,
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\variable_scope.py", line 905, in <lambda>
    partition_info=partition_info)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\init_ops.py", line 134, in __call__
    return array_ops.ones(shape, dtype)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 2572, in ones
    output = fill(shape, constant(one, dtype=dtype), name=name)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 171, in fill
    result = gen_array_ops.fill(dims, value, name=name)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\gen_array_ops.py", line 3602, in fill
    "Fill", dims=dims, value=value, name=name)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

  In call to configurable 'Runner' (<class 'dopamine.discrete_domains.run_experiment.Runner'>)
  In call to configurable 'TrainRunner' (<class 'dopamine.discrete_domains.run_experiment.TrainRunner'>)

In [5]:
# @title Train MyRandomDQNAgent.
print('Will train agent, please be patient, may be a while...')
prioritized_srdqn_runner.run_experiment()
print('Done training!')

Will train agent, please be patient, may be a while...
(32, 84, 84, 4) 19922 Episode length: 1450 Return: 2000.0
(32, 84, 84, 4)


ResourceExhaustedError: OOM when allocating tensor with shape[451584,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node gradients_2/sr_network_1_1/fully_connected/MatMul_grad/MatMul_1 (defined at C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py:1748) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Original stack trace for 'gradients_2/sr_network_1_1/fully_connected/MatMul_grad/MatMul_1':
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\traitlets\config\application.py", line 664, in launch_instance
    app.start()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\platform\asyncio.py", line 149, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\asyncio\base_events.py", line 442, in run_forever
    self._run_once()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\asyncio\base_events.py", line 1462, in _run_once
    handle._run()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 787, in inner
    self.run()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelbase.py", line 381, in dispatch_queue
    yield self.process_one()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 225, in wrapper
    runner = Runner(result, future, yielded)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 714, in __init__
    self.run()
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\kernelbase.py", line 545, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 2867, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 2895, in _run_cell
    return runner(coro)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 3072, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 3263, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-e6597709383f>", line 242, in <module>
    prioritized_srdqn_runner = run_experiment.TrainRunner(LOG_PATH, create_prioritized_srdqn_agent)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\gin\config.py", line 1046, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\dopamine\discrete_domains\run_experiment.py", line 553, in __init__
    create_environment_fn)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\gin\config.py", line 1046, in gin_wrapper
    return fn(*new_args, **new_kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\dopamine\discrete_domains\run_experiment.py", line 219, in __init__
    summary_writer=self._summary_writer)
  File "<ipython-input-4-e6597709383f>", line 199, in create_prioritized_srdqn_agent
    return PrioritizedSRDQNAgent(sess, num_actions=environment.action_space.n)
  File "<ipython-input-4-e6597709383f>", line 15, in __init__
    self._sr_train_op = self._build_sr_train_op()
  File "<ipython-input-4-e6597709383f>", line 89, in _build_sr_train_op
    return self.optimizer.minimize(tf.reduce_mean(loss))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\training\optimizer.py", line 403, in minimize
    grad_loss=grad_loss)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\training\optimizer.py", line 512, in compute_gradients
    colocate_gradients_with_ops=colocate_gradients_with_ops)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\gradients_impl.py", line 158, in gradients
    unconnected_gradients)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\gradients_util.py", line 679, in _GradientsHelper
    lambda: grad_fn(op, *out_grads))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\gradients_util.py", line 350, in _MaybeCompile
    return grad_fn()  # Exit early
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\gradients_util.py", line 679, in <lambda>
    lambda: grad_fn(op, *out_grads))
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\math_grad.py", line 1586, in _MatMulGrad
    grad_b = gen_math_ops.mat_mul(a, grad, transpose_a=True)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\gen_math_ops.py", line 6136, in mat_mul
    name=name)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

...which was originally created as op 'sr_network_1_1/fully_connected/MatMul', defined at:
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
[elided 35 identical lines from previous traceback]
  File "<ipython-input-4-e6597709383f>", line 199, in create_prioritized_srdqn_agent
    return PrioritizedSRDQNAgent(sess, num_actions=environment.action_space.n)
  File "<ipython-input-4-e6597709383f>", line 12, in __init__
    self._build_networks()
  File "<ipython-input-4-e6597709383f>", line 52, in _build_networks
    self._sr_net_outputs = self.sr_convnet(self._replay.transition['state'])
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py", line 854, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\dopamine\discrete_domains\atari_lib.py", line 258, in call
    phi = self.dense_phi(phi)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py", line 854, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\keras\layers\core.py", line 1050, in call
    outputs = gen_math_ops.mat_mul(inputs, self.kernel)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\ops\gen_math_ops.py", line 6136, in mat_mul
    name=name)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "C:\Users\apple\anaconda3\envs\dopamine-need\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()


In [48]:
t1 = tf.constant([0]) 

In [49]:
t2 = tf.constant([[[-100, -1], [-1, -1], [-1, -1], [-1, -1]], [[-1, -1], [-1, -1], [-1, -1000], [-1, -1]]])

In [50]:
t3 = tf.transpose(t2)
# t3 = tf.gather_nd(t2, [[1, 0], [1, 1], [1,2]])
tf.keras.backend.get_value(t3)

array([[[ -100,    -1],
        [   -1,    -1],
        [   -1,    -1],
        [   -1,    -1]],

       [[   -1,    -1],
        [   -1,    -1],
        [   -1, -1000],
        [   -1,    -1]]])

In [51]:
tf.keras.backend.get_value(tf.gather_nd(t2, t1))

array([[-100,   -1],
       [  -1,   -1],
       [  -1,   -1],
       [  -1,   -1]])

In [21]:
tf.one_hot([1, 2, 0], 3)

<tf.Tensor 'one_hot_3:0' shape=(3, 3) dtype=float32>

In [35]:
all = tf.constant([i for i in range(32)])

In [36]:
ind = tf.constant([0 for i in range(32)])

In [37]:
tf.keras.backend.get_value(tf.transpose(tf.stack([all, ind])))

array([[ 0,  0],
       [ 1,  0],
       [ 2,  0],
       [ 3,  0],
       [ 4,  0],
       [ 5,  0],
       [ 6,  0],
       [ 7,  0],
       [ 8,  0],
       [ 9,  0],
       [10,  0],
       [11,  0],
       [12,  0],
       [13,  0],
       [14,  0],
       [15,  0],
       [16,  0],
       [17,  0],
       [18,  0],
       [19,  0],
       [20,  0],
       [21,  0],
       [22,  0],
       [23,  0],
       [24,  0],
       [25,  0],
       [26,  0],
       [27,  0],
       [28,  0],
       [29,  0],
       [30,  0],
       [31,  0]])