## Implementation of Prioritized Experience Replay

Implementing [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) with double Q learning in TensorFlow.

In [1]:
import math
import random
import numpy as np
from utils import BinaryHeap
import gym
import cv2
import os
import tensorflow as tf
from tqdm import tqdm

## Configuration parameters

Configuration parameters as presented in the original papers: [DQN](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html) and [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952).

In [2]:
# Single dictionary with every hyper-parameter used by the environment wrapper, the replay
# memory, the estimator network and the agent defined in this notebook.
conf_parameters = {
    'max_steps': 50000000,        # Total number of training steps
    'test_step': 50000,           # Interval (in steps) between performance reports / checkpoints
    # Input size
    'screen_width': 84,
    'screen_height': 84,
    'history_length': 4,          # Number of stacked screens fed to the network
    
    'memory_size': 1000000,       # Replay memory size
    'batch_size': 32,             # Number of training cases over which each SGD update is computed
    'gamma': 0.99,                # Discount factor
    'learning_rate': 0.00025 / 4, # Learning rate
    'learn_start': 50000,         # Replay start size
    'random_start': 30,           # Maximum number of 'do nothing' actions at the start of an episode
    
    # Exploration parameters
    'ep_min': 0.1,                 # Final exploration
    'ep_start': 1.0,               # Initial exploration
    'ep_end_time': 1000000,        # Final exploration frame

    'target_q_update_step': 10000, # Target network update frequency

    # Train batch
    'train_frequency': 4,          # Update frequency

    # Clip rewards
    'min_reward': -1.0,
    'max_reward': 1.0,

    # How many times should the same action be taken
    'action_repeat': 4,
    
    # Prioritized experience replay parameters
    'alpha': 0.7,                   # Exponent applied to the rank when building the sampling distribution
    'beta_init': 0.5,               # Initial importance-sampling exponent (annealed linearly towards 1)
    'part_number': 100,             # Arbitrary parameter to precompute the sampling distributions
    
    # Whether or not to render the environment 
    'display': False,
    
    # Output locations
    'checkpoint_dir': 'Models/Prio_Exp/',
    'log_dir': 'Logs/Prio_Exp/',
    'log_performance_file': 'Logs/Prio_Exp.txt'
}


# Make sure the checkpoint and log folders exist before anything tries to write into them
for _dir_key in ('checkpoint_dir', 'log_dir'):
    _folder = conf_parameters[_dir_key]
    if not os.path.exists(_folder):
        os.makedirs(_folder)

## Load the game environment

Wrapper class around the gym environment. 

In [3]:
class GymEnvironment():
    '''
    Wrapper around a Gym Atari environment.

    Repeats every chosen action for 'action_repeat' frames, treats the loss of a life as the
    end of an episode while training, and exposes the current frame as a grayscale image
    scaled by 1/255 and resized with cv2 to (screen_width, screen_height).
    '''
    def __init__(self,name,conf):
        '''
        name: id passed to gym.make
        conf: dictionary providing 'screen_width', 'screen_height', 'random_start',
              'action_repeat' and, optionally, 'display'
        '''
        self.env = gym.make(name)            # Initialize Gym environment
        self._screen = None                  # Last raw frame returned by the environment
        self.screen_width = conf['screen_width']
        self.screen_height = conf['screen_height']
        self.random_start = conf['random_start']
        self.action_repeat = conf['action_repeat']
        # Default rendering behaviour, used by render() when no module-level `display` flag exists
        self.display = conf.get('display', False)

    def execute_action(self,action,is_training=True):
        '''
        Executes 'action' for up to 'action_repeat' frames.

        Returns (screen, reward, terminal) where reward is the sum of the rewards collected over
        the repeated frames. While training, losing a life subtracts 1 from the reward and marks
        the step as terminal. The repetition stops as soon as a step is terminal.
        '''
        cum_reward = 0
        start_lives = self.env.ale.lives()

        # `range` instead of the Python-2-only `xrange`: the loop is tiny and this runs everywhere
        for _ in range(self.action_repeat):
            self._screen, reward, terminal, _ = self.env.step(action)
            cum_reward += reward

            if is_training and start_lives > self.env.ale.lives():
                cum_reward -= 1
                terminal = True

            if terminal:
                break

        reward = cum_reward
        self.render()
        return self.screen, reward, terminal

    @property
    def action_size(self):
        return self.env.action_space.n        # Number of available actions

    @property
    def screen(self):     # Method to resize the screen provided by gym to the desired values
        return cv2.resize(cv2.cvtColor(self._screen,cv2.COLOR_RGB2GRAY)/255.,(self.screen_width,self.screen_height))

    def new_game(self):
        '''
        Resets the environment only when no lives are left, then takes one step with action 0.
        Returns (screen, 0, 0, terminal).
        '''
        if self.env.ale.lives() == 0:
            self._screen = self.env.reset()

        self._screen, reward, terminal, _ = self.env.step(0)
        self.render()
        return self.screen, 0, 0, terminal

    def render(self):
        '''
        Renders the environment only if display is enabled.

        A module-level variable named `display` (as set by the notebook's driver cell) takes
        precedence; when it does not exist the value of conf['display'] is used instead of
        failing with a NameError.
        '''
        if globals().get('display', self.display):
            self.env.render()

    def new_random_game(self):
        '''
        Starts a new game and then does 'nothing' (action 0) for a random number of frames,
        at most random_start-1. Returns (screen, 0, 0, terminal).
        '''
        _,_,_,terminal = self.new_game()
        # max(...) keeps randint valid when random_start is 0 (no extra no-op frames at all)
        for _ in range(random.randint(0,max(0,self.random_start-1))):
            self._screen, reward, terminal, _ = self.env.step(0)

        self.render()
        return self.screen, 0, 0, terminal

## History
This class will allow us to stack the last K screens to use them as the input to the network (history of states).

In [4]:
class History:
    '''
    Rolling buffer with the last 'history_length' screens.

    The buffer is stored as (history_length, screen_height, screen_width); get() hands it out
    as (screen_height, screen_width, history_length).
    '''
    def __init__(self, conf):
        self.history = np.zeros([conf['history_length'],conf['screen_height'],conf['screen_width']],dtype=np.float32)

    def add(self, screen):
        '''Drops the oldest screen and appends 'screen' as the newest one (in place).'''
        self.history[:-1] = self.history[1:]
        self.history[-1] = screen

    def get(self):
        '''
        Returns a snapshot of the current stack with the history axis last.

        np.transpose only returns a view of self.history, and add() modifies that buffer in
        place, so without the copy every array handed out earlier would change on the next add().
        '''
        return np.transpose(self.history,(1,2,0)).copy()

## Prioritized Experience Replay

Implementing the rank-based approach based on contribution from [Damcy](https://github.com/Damcy/prioritized-experience-replay). The class employs a [BinaryHeap](https://github.com/camigord/OpenAI_ReinforcementLearning/blob/master/Prioritized%20Exp%20Replay/Binary_heap.ipynb) to sort the experiences according to their priority. Sampling is performed as described in the [original](https://arxiv.org/abs/1511.05952) paper.

In [5]:
class Rank_Priority_Replay():
    '''
    Rank-based prioritized experience replay.

    Experiences are kept in a dictionary keyed by their slot index, while a BinaryHeap keeps
    track of their priorities. The sampling distributions P(i) = rank_i^(-alpha) / sum(rank^(-alpha))
    are precomputed for several memory fill levels and split into 'batch_size' strata.
    '''
    def __init__(self,conf):
        self.memory_size = conf['memory_size']
        # Slot index -> experience tuple
        self._experience = {}

        self.alpha = conf['alpha']
        self.beta_init = conf['beta_init']
        self.batch_size = conf['batch_size']
        self.partition_num = conf['part_number']    # Split total size to N segments
        self.learn_start = conf['learn_start']
        # Per-step increment that takes beta from beta_init to 1 between learn_start and max_steps
        self.beta_grad = (1-self.beta_init) / float(conf['max_steps'] - self.learn_start)

        self.priority_queue = BinaryHeap(self.memory_size)
        self.distributions = self.build_distributions()
        self.current = 0     # Pointer to the current saving location
        self.count = 0       # Number of collected experiences

    def build_distributions(self):
        '''
        Preprocess probabilities: (rank_i)^(-alpha) / sum((rank_i)^(-alpha))

        Returns a dictionary {partition_number: {'pdf': ..., 'strata_ends': ...}} with one entry
        for every partition whose size n is at least learn_start. 'pdf' is a NumPy array with the
        n probabilities (index 0 is rank 1) and 'strata_ends' maps stratum s (1..batch_size+1) to
        the position where it starts.
        '''
        results = {}

        # Creating different distributions according to the number of experiences which have been collected
        partition_size = self.memory_size // self.partition_num
        stratum_mass = 1/float(self.batch_size)
        partition_ends = range(partition_size,self.memory_size+1,partition_size)

        for current_partition, n in enumerate(partition_ends, 1):
            if n < self.learn_start:
                continue

            # P(i) = (rank_i)^(-alpha) / sum((rank_i)^(-alpha))
            # NumPy arrays instead of Python lists: same values, a fraction of the memory and time
            pdf = np.power(np.arange(1,n+1,dtype=np.float64),-self.alpha)
            pdf /= math.fsum(pdf)

            # Split each distribution to K segments, setting k = batch_size
            # strata_ends keeps start and end position of each segment
            cdf = np.cumsum(pdf)
            strata_ends = {1: 0, self.batch_size+1: n}
            step = stratum_mass
            index = 1
            for s in range(2,self.batch_size+1):
                while cdf[index] < step:
                    index += 1
                strata_ends[s] = index
                step += stratum_mass

            results[current_partition] = {'pdf': pdf, 'strata_ends': strata_ends}

        return results

    def add(self, experience):
        '''
        Stores experience: experience is a tuple of (s1,a,r,s2,t)
        The new experience is inserted in the priority queue with the current maximum priority.
        '''
        self.count = max(self.count, self.current+1)
        self.current = (self.current + 1) % self.memory_size

        # Assigning to the slot replaces whatever experience was stored there before
        self._experience[self.current] = experience

        # Add to priority queue
        priority = self.priority_queue.get_max_priority()
        self.priority_queue.update(priority,self.current)

    def retrieve(self,indices):
        '''
        Get experiences from indices
        '''
        return [self._experience[v] for v in indices]

    def rebalance(self):
        '''
        Rebalance priority queue
        '''
        self.priority_queue.balance_tree()

    def update_priority(self,indices,delta):
        '''
        Update the priority values according to new observations:
        the new priority of experience indices[i] is |delta[i]|
        '''
        for e_id, td_error in zip(indices,delta):
            self.priority_queue.update(math.fabs(td_error),e_id)

    def sample_from_priority_replay(self,global_step):
        '''
        Samples a minibatch from exp replay

        Returns (state, action, reward, next_state, terminal, w, rank_e_id), where w are the
        importance-sampling weights normalized by their maximum and rank_e_id the experience ids
        of the sampled transitions. Returns 0 if fewer than learn_start experiences are stored.
        '''
        if self.count < self.learn_start:
            print('Number of records less than learn_start. Failed')
            return 0

        # Get distribution according to current number of collected experiences
        dist_index = int(math.ceil(self.count / float(self.memory_size) * self.partition_num))
        distribution = self.distributions[dist_index]
        strata_ends = distribution['strata_ends']

        local_N = dist_index * (self.memory_size // self.partition_num)

        rank_list = []
        # Sample from K segments, with K = batch_size
        n = 1
        while len(rank_list) < self.batch_size:
            while True:
                # Check that the range is within the number of collected samples
                min_idx = min(strata_ends[n]+1, self.count)
                max_idx = min(strata_ends[n+1], self.count)

                idx = random.randint(min_idx, max_idx)

                # If range is larger than current amount of samples, make sure we start from the first range
                # (the original tested this very same condition twice, joined by `or`)
                if strata_ends[n+1] > self.count:
                    n = 1

                # Get experience_id corresponding to this index
                exp_id = self.priority_queue.priority_to_experience([idx])[0]
                # If index wraps over terminal state, get new one
                if self._experience[exp_id][4]:
                    continue
                # Otherwise use index
                break
            n += 1
            rank_list.append(idx)

        # Beta increases linearly from b_0 up to 1
        beta = min(self.beta_init + (global_step - self.learn_start - 1) * self.beta_grad, 1)
        # Get probabilities P(i)
        P_i = [distribution['pdf'][v-1] for v in rank_list]
        # Compute the weights
        w = np.power(np.array(P_i) * local_N, -beta)
        w_max = max(w)
        # Normalizing the weights
        w = np.divide(w,w_max)
        # Rank_list contains priority IDs, convert them to experience IDs
        rank_e_id = self.priority_queue.priority_to_experience(rank_list)
        # Get experiences
        experience_batch = self.retrieve(rank_e_id)
        state, action, reward, next_state, terminal = map(np.array, zip(*experience_batch))

        return state, action , reward, next_state, terminal, w, rank_e_id

    def sample_from_replay(self):
        '''
        Randomly sampling from experiences (Standard implementation).
        Returns (state, action, reward, next_state, terminal); terminal experiences are skipped.
        '''
        indexes = []
        while len(indexes) < self.batch_size:
            index = random.randint(1,self.count-1)
            # If index wraps over terminal state, get new one
            if self._experience[index][4]:
                continue
            # Use the index otherwise
            indexes.append(index)

        experience_batch = self.retrieve(indexes)
        state, action , reward, next_state, terminal = map(np.array, zip(*experience_batch))

        return state, action, reward, next_state, terminal

## Defining our network architecture

This class will represent both our training network and our target network. 

In [6]:
class ConvNet_Estimator():
    '''Q-value estimator neural network
       This architecture will be used both for the Q-network and the Target network.
    '''

    def __init__(self,conf,num_actions,scope='estimator',summaries_dir=None):
        '''
        conf:          dictionary with the network / exploration hyper-parameters read below
        num_actions:   number of outputs of the network
        scope:         variable scope under which the graph is built
        summaries_dir: accepted for interface compatibility, not used by this class
        '''
        self.scope = scope
        self.num_actions = num_actions
        self.screen_width = conf['screen_width']
        self.screen_height = conf['screen_height']
        self.history_length = conf['history_length']
        self.ep_min = conf['ep_min']
        self.ep_start = conf['ep_start']
        self.ep_end_time = conf['ep_end_time']
        self.learn_start = conf['learn_start']
        self.batch_size = conf['batch_size']
        self.learning_rate = conf['learning_rate']

        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()

    def _build_model(self):
        '''
        Building the network architecture
        '''
        initializer = tf.constant_initializer(0.0)  # Change to None to remove biases from conv layers

        # Our input: 4(or history_length) Frames taken from the environment
        self.X_pl = tf.placeholder(tf.float32,[None, self.screen_height, self.screen_width, self.history_length],name='X')
        # The TD target value
        self.y_pl = tf.placeholder(tf.float32,[None],name='y')
        # Integer id of selected action
        self.action_pl = tf.placeholder(tf.int32,[None],name='action')

        # Three convolutional layers
        conv1 = tf.contrib.layers.conv2d(self.X_pl,32,8,4,padding = 'VALID',
                                         activation_fn=tf.nn.relu,biases_initializer=initializer)
        conv2 = tf.contrib.layers.conv2d(conv1,64,4,2,padding = 'VALID',
                                         activation_fn=tf.nn.relu,biases_initializer=initializer)
        # NOTE(review): 63 filters looks like a typo for 64 (conv2 uses 64); left untouched
        # because changing it would make previously saved checkpoints impossible to restore.
        conv3 = tf.contrib.layers.conv2d(conv2,63,3,1,padding = 'VALID',
                                         activation_fn=tf.nn.relu,biases_initializer=initializer)

        # Fully connected layers
        flattened = tf.contrib.layers.flatten(conv3)
        fc1 = tf.contrib.layers.fully_connected(flattened,512,activation_fn=tf.nn.relu,biases_initializer=initializer)
        self.predictions = tf.contrib.layers.fully_connected(fc1,self.num_actions,biases_initializer=initializer)
        self.best_action = tf.argmax(self.predictions,dimension=1)

        # One hot of the action which was taken
        action_one_hot = tf.one_hot(self.action_pl,self.num_actions,1.0,0.0)
        # Get the prediction of the chosen actions only
        self.action_predictions = tf.reduce_sum(self.predictions * action_one_hot,reduction_indices=1)

        # DOUBLE Q-LEARNING!
        self.outputs_idx = tf.placeholder(tf.int32,[None,None])
        self.outputs_with_idx = tf.gather_nd(self.predictions,self.outputs_idx)

        # For rank-based prioritized experience replay: importance-sampling weights,
        # defaulting to 1 so the loss is unweighted when nothing is fed
        default_weight = tf.ones([self.batch_size])
        self.wi_pl = tf.placeholder_with_default(default_weight, [self.batch_size])

        # TD error of every sample in the batch
        self.delta = self.y_pl-self.action_predictions
        self.weighted_delta = tf.mul(self.delta, self.wi_pl)

        # Calculate the loss: the importance-sampling weight scales the Huber loss of each
        # sample, so the gradient of sample j is w_j * clip(delta_j) * grad Q.
        # Previously the weights were fed to the graph but never reached the loss.
        self.loss = tf.reduce_mean(self.wi_pl * self._clipped_error(self.delta))

        # Optimizer using parameters from original paper
        self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate,momentum=0.95,epsilon=0.01)
        self.train_op = self.optimizer.minimize(self.loss,global_step=tf.contrib.framework.get_global_step())

        # Summaries for Tensorboard
        self.summaries = tf.merge_summary([
                tf.scalar_summary('loss', self.loss),
                tf.histogram_summary('q_values_hist',self.predictions),
                tf.scalar_summary('max_q_value',tf.reduce_max(self.predictions))
            ])

    def _clipped_error(self, x):
        '''
        Huber loss: 0.5*x^2 where |x| < 1, |x| - 0.5 elsewhere.
        '''
        # tf.select is tried first and tf.where is the fallback for builds that lack it;
        # only the missing-attribute case is caught so real graph errors still surface
        try:
            return tf.select(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)
        except AttributeError:
            return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)

    def get_best_action(self,sess,state):
        '''Index of the highest-valued action for every state in the batch.'''
        return sess.run(self.best_action,{self.X_pl:state})

    def get_output_with_idx(self,sess,state,idx):
        '''Predicted values gathered at the [row, action] pairs given in idx.'''
        return sess.run(self.outputs_with_idx,{self.X_pl:state, self.outputs_idx:idx})

    def predict(self,sess,state):
        '''
        Predicts action values.
        '''
        return sess.run(self.predictions,{self.X_pl:state})

    def determine_action(self,sess,state,current_step,test_ep=None):
        '''
        Predicts action based on q values for current state and exploration strategy

        test_ep: fixed exploration probability; when None the decaying training epsilon is used.
                 An explicit 0 is honoured (fully greedy) instead of being treated as "not given".
        '''
        if test_ep is None:
            # Decaying epsilon starting at ep_start and decreasing until ep_min once the learning process starts
            ep = self.ep_min + max(0.,(self.ep_start-self.ep_min)*(self.ep_end_time-max(0.,current_step-self.learn_start))/self.ep_end_time)
        else:
            ep = test_ep

        if random.random() < ep:
            # Explore: random action
            action = random.randrange(self.num_actions)
        else:
            action = sess.run(self.best_action,{self.X_pl:[state]})[0]

        return action

    def update(self,sess,state,action,target,w_i):
        '''
        Updates the estimator towards the given targets

        Returns (q_values, loss, deltas, summaries), where deltas are the unweighted TD errors.
        '''
        feed_dict = {self.X_pl:state, self.y_pl:target, self.action_pl:action, self.wi_pl:w_i}
        q_t, summaries, _, loss, deltas = sess.run([self.predictions, self.summaries,self.train_op,self.loss,
                                                    self.delta],feed_dict)

        return q_t, loss, deltas, summaries

## Learning agent

In [7]:
class Agent():
    '''
    Double-DQN agent trained with rank-based prioritized experience replay.

    Owns the environment wrapper, the replay memory, the screen history and the two
    estimators (training network 'q' and target network 'target_q').
    '''
    def __init__(self,conf,environment,sess):
        '''
        conf:        dictionary with the hyper-parameters read below
        environment: environment wrapper (see GymEnvironment)
        sess:        TensorFlow session used for every graph operation
        '''
        self.conf = conf
        self.sess = sess
        self.env = environment
        self.exp_replay = Rank_Priority_Replay(self.conf)
        self.history = History(self.conf)
        self.checkpoint_dir = self.conf['checkpoint_dir']
        self.log_performance_file = self.conf['log_performance_file']
        self.log_dir = self.conf['log_dir']
        self.history_length = self.conf['history_length']
        self.learn_start = self.conf['learn_start']
        self.train_frequency = self.conf['train_frequency']
        self.target_q_update_step = self.conf['target_q_update_step']
        self.test_step = self.conf['test_step']
        self.ep_min = self.conf['ep_min']
        self.max_steps = self.conf['max_steps']
        self.min_reward = self.conf['min_reward']
        self.max_reward = self.conf['max_reward']
        self.gamma = self.conf['gamma']

        # Built from the configuration given to this agent (not from the global conf_parameters)
        self.q_estimator = ConvNet_Estimator(self.conf,self.env.action_size,scope='q',summaries_dir=self.log_dir)
        self.target_estimator = ConvNet_Estimator(self.conf,self.env.action_size,scope='target_q')

        with tf.variable_scope('step'):
            self.step_op = tf.Variable(0,trainable=False,name='step')
            self.step_input = tf.placeholder(tf.int32,None,name='step_input')
            self.step_assign_op = self.step_op.assign(self.step_input)

        # Add summaries
        with tf.variable_scope('summary'):
            scalar_summary_tags = ['average.reward', 'average.loss', 'average.q', 'episode.max reward', 'episode.min reward',\
                                  'episode.avg reward', 'episode.num of games']
            self.summary_placeholders = {}
            self.summary_ops = {}
            for tag in scalar_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(tf.float32,None,name=tag.replace(' ','_'))
                self.summary_ops[tag] = tf.scalar_summary(tag, self.summary_placeholders[tag])

            histogram_summary_tags = ['episode.rewards','episode.actions']
            for tag in histogram_summary_tags:
                self.summary_placeholders[tag] = tf.placeholder(tf.float32,None,name=tag.replace(' ','_'))
                self.summary_ops[tag] = tf.histogram_summary(tag, self.summary_placeholders[tag])

            self.writer = tf.train.SummaryWriter(self.log_dir,self.sess.graph)

        tf.initialize_all_variables().run()
        self.saver = tf.train.Saver(max_to_keep = 3)

        self.load_model()
        self.update_target_q_network(self.sess)

    def update_target_q_network(self,sess):
        '''
        Copies every trainable variable of the training network into the target network.
        Variables are paired after sorting both sets by name.
        '''
        trainable = tf.trainable_variables()
        e1_params = sorted([t for t in trainable if t.name.startswith(self.q_estimator.scope)],
                           key=lambda v: v.name)
        e2_params = sorted([t for t in trainable if t.name.startswith(self.target_estimator.scope)],
                           key=lambda v: v.name)

        update_ops = [e2_v.assign(e1_v) for e1_v, e2_v in zip(e1_params,e2_params)]
        sess.run(update_ops)

    def load_model(self,step=None):
        '''Restores the latest checkpoint found in checkpoint_dir, if there is one.'''
        latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
        if latest_checkpoint:
            print('[*] Loading Checkpoint {}...\n'.format(latest_checkpoint))
            self.saver.restore(self.sess,latest_checkpoint)
        else:
            print('[!] Could not find any previous checkpoint')

    def save_model(self,step=None):
        '''Notes the save in the performance log and writes a checkpoint tagged with 'step'.'''
        with open(self.log_performance_file, 'a') as f:
            f.write('[*] Saving checkpoints...')

        self.saver.save(self.sess,os.path.join(self.checkpoint_dir,'model'),global_step=step)

    def train(self):
        '''
        Main training loop: act, observe, and every test_step steps (once learning has started)
        log the statistics of the period, save the model if it is good enough and write the
        Tensorboard summaries.
        '''
        start_step = self.step_op.eval()

        num_game, self.update_count, ep_reward = 0, 0, 0.
        total_reward, self.total_loss, self.total_q = 0., 0., 0.
        max_avg_ep_reward = 0
        ep_rewards, actions = [], []

        screen,_,_,_ = self.env.new_random_game()

        # Stacking the screen in the first input buffer
        for _ in range(self.history_length):
            self.history.add(screen)

        # Training:
        for self.step in tqdm(range(start_step,self.max_steps),ncols=70,initial=start_step):
            if self.step == self.learn_start:
                num_game, self.update_count, ep_reward = 0, 0, 0.
                total_reward, self.total_loss, self.total_q = 0., 0., 0.
                ep_rewards, actions = [], []

            # 1. Predict: Use our training network to select an action
            action = self.q_estimator.determine_action(self.sess,self.history.get(),self.step)

            # 2. Execute the action
            screen, reward, terminal = self.env.execute_action(action,is_training=True)

            # 3. New observation
            self.observe(screen,reward,action,terminal)

            if terminal:
                screen,_,_,_ = self.env.new_random_game()
                num_game += 1
                ep_rewards.append(ep_reward)
                ep_reward = 0
            else:
                ep_reward += reward
                total_reward += reward
                actions.append(action)

            if self.step >= self.learn_start and self.step % self.test_step == self.test_step - 1:
                avg_reward = total_reward / self.test_step
                # No update may have happened in this period (e.g. right after resuming from a
                # checkpoint with an empty replay memory): report 0 instead of dividing by zero
                updates = max(self.update_count, 1)
                avg_loss = self.total_loss / updates
                avg_q = self.total_q / updates

                if ep_rewards:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                else:
                    # No episode finished during this period
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0,0,0

                with open(self.log_performance_file, 'a') as f:
                    f.write('\nAvg. reward: %.4f, Avg. loss: %.6f, Avg. Q: %3.6f' % (avg_reward, avg_loss, avg_q))
                    f.write('Avg. Ep. Reward: %.4f, Max Ep. Reward: %.4f, Min Ep. Reward: %.4f, # Game: %d' \
                            % (avg_ep_reward, max_ep_reward, min_ep_reward, num_game))

                # Save whenever the average episode reward is within 90% of the best one seen
                if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                    self.step_assign_op.eval({self.step_input: self.step+1})
                    self.save_model(self.step+1)
                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                self.inject_summary({
                        'average.reward' : avg_reward,
                        'average.loss': avg_loss,
                        'average.q': avg_q,
                        'episode.max reward':max_ep_reward,
                        'episode.min reward':min_ep_reward,
                        'episode.avg reward':avg_ep_reward,
                        'episode.num of games':num_game,
                        'episode.rewards': ep_rewards,
                        'episode.actions': actions,
                    })

                num_game = 0
                total_reward = 0.
                self.total_loss = 0.
                self.total_q = 0.
                self.update_count = 0
                ep_reward = 0
                ep_rewards = []
                actions = []

    def observe(self,screen,reward,action,terminal):
        '''
        Stores the transition produced by the last action and, when it is time, trains the
        network and/or refreshes the target network.
        '''
        # Clip reward
        reward = max(self.min_reward,min(self.max_reward,reward))

        # Snapshot the state BEFORE the history is modified. np.array copies, so the stored
        # state cannot be altered by later history.add() calls even if get() returns a view.
        prev_state = np.array(self.history.get())
        # Add to history
        self.history.add(screen)
        next_state = np.array(self.history.get())

        # Add to exp. replay: tuple (s,a,r,s_t+1,terminal), the order in which the replay
        # memory unpacks its samples (action and reward used to be swapped here).
        # NOTE: every stored transition now owns its two state arrays, so memory use grows
        # with 'memory_size'; size the replay memory accordingly.
        self.exp_replay.add((prev_state,action,reward,next_state,terminal))

        if self.step > self.learn_start:
            # If it is time to train the network
            if self.step % self.train_frequency == 0:
                self.q_learning_mini_batch()

            # If it is time to update Target network
            if self.step % self.target_q_update_step == self.target_q_update_step -1:
                self.update_target_q_network(self.sess)

    def q_learning_mini_batch(self):
        '''
        One double Q-learning update on a prioritized minibatch, followed by the update of the
        priorities of the sampled experiences with their new TD errors.
        '''
        if self.exp_replay.count < self.history_length or self.exp_replay.count < self.learn_start:
            # Not enough experiences
            return

        # Sample from rank-based prioritized exp. replay
        init_state, action , reward, end_state, terminal, w_i, rank_e_id = self.exp_replay.sample_from_priority_replay(self.step)

        # DOUBLE Q-LEARNING: the training network picks the action, the target network evaluates it
        best_action_t_plus_1 = self.q_estimator.get_best_action(self.sess,end_state)
        q_t_plus_1_with_pred_action = self.target_estimator.get_output_with_idx(
            self.sess,end_state,[[idx,pred_a] for idx,pred_a in enumerate(best_action_t_plus_1)])
        terminal = np.array(terminal) + 0.
        target_q_t = (1. - terminal) * self.gamma * q_t_plus_1_with_pred_action + reward

        # The importance-sampling weights are sent to the graph; the TD errors come back
        q_t, loss, deltas, summaries = self.q_estimator.update(self.sess,init_state,action,target_q_t, w_i)

        # Update experiences priorities according to deltas and rank_e_id
        self.exp_replay.update_priority(rank_e_id,deltas)

        self.writer.add_summary(summaries,self.step)
        self.total_loss += loss
        self.total_q += q_t.mean()
        self.update_count += 1

    def inject_summary(self,tag_dir):
        '''Evaluates the summary op of every tag in tag_dir and writes the results for the current step.'''
        summary_str_lists = self.sess.run([self.summary_ops[tag] for tag in tag_dir.keys()], {
                self.summary_placeholders[tag]: value for tag , value in tag_dir.items()
            })
        for summary_str in summary_str_lists:
            self.writer.add_summary(summary_str,self.step)

    def play(self,n_step=10000,n_episode=100,test_epsilon=None):
        '''
        Plays n_episode episodes of at most n_step steps each with a fixed exploration rate
        (ep_min when test_epsilon is None) and prints the best episode reward seen so far.
        '''
        if test_epsilon is None:
            test_epsilon = self.ep_min

        test_history = History(self.conf)
        best_reward, best_idx = 0, 0
        for idx in range(n_episode):
            screen,_,_,_ = self.env.new_random_game()
            current_reward = 0

            for _ in range(self.history_length):
                test_history.add(screen)

            for t in tqdm(range(n_step),ncols=70):
                # 1. Predict
                action = self.q_estimator.determine_action(self.sess,test_history.get(),0,test_epsilon)

                # 2. Execute the action
                screen, reward, terminal = self.env.execute_action(action,is_training=False)

                # 3. New observation
                test_history.add(screen)

                current_reward += reward
                if terminal:
                    break

            if current_reward > best_reward:
                best_reward = current_reward
                best_idx = idx

            print('='*30)
            print('[%d] Best reward: %d' % (best_idx, best_reward))
            print('='*30)

        self.env.env.render(close=True)

In [2]:
is_train = True

with tf.Session() as sess:
    env = GymEnvironment('Breakout-v0',conf_parameters)
    agent = Agent(conf_parameters,env,sess)

    # Module-level flag looked up by GymEnvironment.render(): render only when playing
    display = not is_train

    if is_train:
        agent.train()
    else:
        agent.play()

NameError: name 'GymEnvironment' is not defined