## Implementation of Prioritized Experience Replay

This notebook implements the rank-based variant of prioritized experience replay. The code is based on a contribution from [Damcy](https://github.com/Damcy/prioritized-experience-replay).

In [1]:
import math
import random
import numpy as np
from utils import BinaryHeap
import gym
import cv2
import os

## Configuration parameters

Configuration parameters as presented in the [original](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html) paper.

In [22]:
# Hyper-parameters shared by the environment wrapper, the frame history and
# the replay memory defined below.
conf_parameters = dict(
    max_steps=50000000,
    test_step=50000,

    # Input size
    screen_width=84,
    screen_height=84,
    history_length=4,

    memory_size=1000000,         # Replay memory size
    batch_size=32,               # Number of training cases over which each SGD update is computed
    gamma=0.99,                  # Discount factor
    learning_rate=0.00025,       # Learning rate
    learn_start=50000,           # Replay start size
    random_start=30,             # Maximum number of 'do nothing' actions at the start of an episode

    # Exploration parameters
    ep_min=0.1,                  # Final exploration
    ep_start=1.0,                # Initial exploration
    ep_end_time=1000000,         # Final exploration frame

    target_q_update_step=10000,  # Target network update frequency

    # Train batch
    train_frequency=4,           # Update frequency

    # Clip rewards
    min_reward=-1.0,
    max_reward=1.0,

    # How many times the same action should be taken
    action_repeat=4,

    # Prioritized experience replay parameters
    alpha=0.7,
    beta_init=0.5,
    part_number=100,
)

# Whether or not to render the environment
display = False

checkpoint_dir = 'Models/Prio_Exp/'
log_dir = 'Logs/Prio_Exp/'
log_performance_file = 'Logs/Prio_Exp.txt'

# Create the output directories the first time the notebook is run
for _directory in (checkpoint_dir, log_dir):
    if not os.path.exists(_directory):
        os.makedirs(_directory)

In [3]:
class GymEnvironment():
    '''
    Wrapper around a Gym environment that repeats actions, accumulates the
    reward and exposes the current frame as a resized grayscale image.

    The wrapper reads the number of remaining lives through `self.env.ale`,
    so the wrapped environment must expose that attribute.
    '''
    def __init__(self,name,conf):
        '''
        name: id passed to gym.make
        conf: dict-like configuration; reads 'screen_width', 'screen_height',
              'random_start' and (optionally) 'action_repeat'
        '''
        self.env = gym.make(name)            # Initialize Gym environment
        self._screen = None                  # Last raw frame returned by the environment
        self.screen_width = conf['screen_width']
        self.screen_height = conf['screen_height']
        self.random_start = conf['random_start']
        # Number of times each selected action is repeated. The original code
        # referenced an undefined global here; fall back to a single step when
        # the configuration does not provide the key.
        self.action_repeat = conf.get('action_repeat', 1)
        
    def execute_action(self,action,is_training=True):
        '''
        Executes the selected action up to 'action_repeat' times and returns
        (processed screen, cumulative reward, terminal).

        While training, losing a life subtracts 1 from the cumulative reward
        and is reported as a terminal transition.
        '''
        cum_reward = 0
        terminal = False     # Stays False if no step is executed
        start_lives = self.env.ale.lives()
        
        for _ in range(self.action_repeat):
            self._screen, reward, terminal, _ = self.env.step(action)
            cum_reward += reward
            
            # A lost life is treated as the end of the episode during training
            if is_training and start_lives > self.env.ale.lives():
                cum_reward -= 1
                terminal = True
                
            if terminal:
                break
                
        self.render()
        return self.screen, cum_reward, terminal
    
    @property
    def action_size(self):
        return self.env.action_space.n        # Number of available actions

    @property
    def screen(self):
        # Converts the last raw frame to grayscale, scales it by 1/255 and
        # resizes it to (screen_width, screen_height)
        gray = cv2.cvtColor(self._screen,cv2.COLOR_RGB2GRAY)/255.
        return cv2.resize(gray,(self.screen_width,self.screen_height))
    
    def new_game(self):
        '''
        Resets the environment only when no lives are left, then performs one
        step with action 0. Returns (processed screen, 0, 0, terminal).
        '''
        if self.env.ale.lives() == 0:
            self._screen = self.env.reset()
        
        self._screen, reward, terminal, _ = self.env.step(0)
        self.render()
        return self.screen, 0, 0, terminal
    
    def render(self):    # Renders the environment only if display == True
        if display:
            self.env.render()
    
    def new_random_game(self):
        '''
        Starts a new game and then takes action 0 for a random number of
        steps in [0, random_start-1]. Returns (processed screen, 0, 0, terminal).
        '''
        _,_,_,terminal = self.new_game()
        for _ in range(random.randint(0,self.random_start-1)):
            self._screen, reward, terminal, _ = self.env.step(0)
        
        self.render()
        return self.screen, 0, 0, terminal

In [4]:
class History:
    '''
    Fixed-length stack of the most recent screens, oldest first.
    '''
    def __init__(self, conf):
        # Buffer layout: (history_length, screen_height, screen_width)
        shape = (conf['history_length'], conf['screen_height'], conf['screen_width'])
        self.history = np.zeros(shape, dtype=np.float32)

    def add(self, screen):
        '''Drops the oldest screen and appends `screen` as the newest one.'''
        frames = self.history
        # Shift every frame one slot towards the front, in place
        frames[0:-1, ...] = frames[1:, ...]
        frames[-1, ...] = screen

    def get(self):
        '''Returns the stack with the history axis moved to the last position.'''
        return self.history.transpose(1, 2, 0)

In [31]:
class Rank_Priority_Replay():
    '''
    Replay memory with rank-based prioritized sampling.

    Experiences are kept in a dict keyed by slot index, while a BinaryHeap
    (imported from utils) keeps the priorities. Sampling probabilities
    P(i) = rank_i^(-alpha) / sum_k(rank_k^(-alpha)) are precomputed for
    several memory fill levels, each split into 'batch_size' strata.
    '''
    def __init__(self,conf):
        '''
        conf: dict-like configuration; reads 'memory_size', 'alpha',
              'beta_init', 'batch_size', 'part_number', 'learn_start' and
              'max_steps'
        '''
        self.memory_size = conf['memory_size']
        # Stored experiences, keyed by slot index
        self._experience = {}
        
        self.alpha = conf['alpha']
        self.beta_init = conf['beta_init']
        self.batch_size = conf['batch_size']
        self.partition_num = conf['part_number']    # Split total size to N segments
        self.learn_start = conf['learn_start']
        # Increment of beta per step so that it reaches 1 at 'max_steps'
        self.beta_grad = (1-self.beta_init) / float(conf['max_steps'] - self.learn_start)
        
        self.priority_queue = BinaryHeap(self.memory_size)
        self.distributions = self.build_distributions()        
        self.current = 0     # Pointer to the current saving location
        self.count = 0       # Number of collected experiences
        
    def build_distributions(self):
        '''
        Preprocess probabilities: (rank_i)^(-alpha) / sum((rank_i)^(-alpha))

        Returns a dict mapping the partition number to a dict with
          'pdf':         list of probabilities for ranks 1..n
          'strata_ends': dict {1: 0, ..., batch_size+1: n} with the rank
                         boundaries of the equal-probability segments
        Partitions holding fewer than 'learn_start' experiences are skipped.
        '''
        results = {}  
        
        # Creating different distributions according to the number of experiences which have been collected
        partition_size = int(math.floor(self.memory_size / self.partition_num))
        current_partition = 1
        
        for n in range(partition_size,self.memory_size+1,partition_size):
            if self.learn_start <= n:
                distribution = {}
                # P(i) = (rank_i)^(-alpha) / sum((rank_i)^(-alpha))
                pdf = [math.pow(rank,-self.alpha) for rank in range(1,n+1)]
                pdf_sum = math.fsum(pdf)
                distribution['pdf'] = [p/pdf_sum for p in pdf]
                
                # Split each distribution to K segments, setting k = batch_size
                # strata_ends keeps start and end position of each segment
                cdf = np.cumsum(distribution['pdf'])
                strata_ends = {1: 0, self.batch_size+1: n}
                step = 1/float(self.batch_size)
                index = 1
                for s in range(2,self.batch_size+1):
                    # Advance until the cumulative probability reaches the next boundary
                    while cdf[index] < step:
                        index += 1
                    strata_ends[s] = index
                    step += 1/float(self.batch_size)
                    
                distribution['strata_ends'] = strata_ends
                results[current_partition] = distribution
            
            # The partition number advances even for skipped partitions
            current_partition += 1
            
        return results
        
    def add(self, experience):
        '''
        Stores experience: experience is a tuple of (s1,a,r,s2,t)
        '''
        self.count = max(self.count, self.current+1)
        # The pointer is advanced before writing, so the first slot used is 1
        self.current = (self.current + 1) % self.memory_size
        
        # Overwrites whatever was stored in this slot before
        self._experience[self.current] = experience

        # Add to priority queue using the current maximum priority
        priority = self.priority_queue.get_max_priority()
        self.priority_queue.update(priority,self.current)
                    
    def retrieve(self,indices):
        '''
        Get experiences from indices
        '''
        return [self._experience[v] for v in indices]
    
    def rebalance(self):
        '''
        Rebalance priority queue
        '''
        self.priority_queue.balance_tree()
        
    def update_priority(self,indices,delta):
        '''
        Update the priority values according to new observations.
        The new priority of indices[i] is |delta[i]|.
        '''
        for i, exp_id in enumerate(indices):
            self.priority_queue.update(math.fabs(delta[i]),exp_id)

    def _annealed_beta(self,global_step):
        '''
        Importance-sampling exponent for the given step: grows linearly by
        'beta_grad' per step and is clamped to [beta_init, 1].
        '''
        beta = self.beta_init + (global_step - self.learn_start - 1) * self.beta_grad
        # Without the lower bound, steps before learn_start+1 would yield a
        # value below beta_init
        return min(max(beta, self.beta_init), 1)
    
    def sample_from_priority_replay(self,global_step):
        '''
        Samples a minibatch from exp replay

        Returns (state, action, reward, next_state, terminal, w, rank_e_id),
        where w are the importance-sampling weights normalized by their
        maximum and rank_e_id are the experience ids of the sampled items.
        Returns 0 if fewer than 'learn_start' experiences have been stored.
        '''
        if self.count < self.learn_start:
            print('Number of records less than learn_start. Failed')
            return 0
        
        # Get distribution according to current number of collected experiences
        dist_index = int(math.ceil(self.count / float(self.memory_size) * self.partition_num))
        distribution = self.distributions[dist_index]
        strata_ends = distribution['strata_ends']
        
        # Number of ranks covered by the selected distribution
        local_N = dist_index * math.floor(self.memory_size / self.partition_num)
                
        rank_list = []
        # Sample from K segments, with K = batch_size
        n = 1
        while len(rank_list) < self.batch_size:
            while True:           
                # Check that the range is within the number of collected samples
                if strata_ends[n+1] > self.count:
                    # Limit the range to be within available size
                    idx = random.randint(strata_ends[n]+1, self.count)
                else:
                    idx = random.randint(strata_ends[n]+1, strata_ends[n+1])
                # Get experience_id corresponding to this index
                exp_id = self.priority_queue.priority_to_experience([idx])[0]
                # If index wraps over terminal state, get new one
                # NOTE(review): this retries forever if every candidate in the
                # range is terminal - confirm this cannot happen in practice
                if self._experience[exp_id][4]:
                    continue
                # Otherwise use the index making sure we start from the first interval if necessary
                # NOTE(review): n is reset to 1 and then incremented below, so
                # sampling actually resumes from segment 2 - confirm intent
                if strata_ends[n+1] > self.count:
                    n = 1
                break
            n += 1
            rank_list.append(idx)
            
        # Beta increases linearly from b_0 up to 1
        beta = self._annealed_beta(global_step)
        # Get probabilities P(i)
        P_i = [distribution['pdf'][v-1] for v in rank_list]
        # Compute the weights
        w = np.power(np.array(P_i) * local_N, -beta)
        w_max = max(w)
        # Normalizing the weights
        w = np.divide(w,w_max)
        # Rank_list contains priority IDs, convert them to experience IDs
        rank_e_id = self.priority_queue.priority_to_experience(rank_list)
        # Get experiences
        experience_batch = self.retrieve(rank_e_id)
        state, action, reward, next_state, terminal = map(np.array, zip(*experience_batch))
        
        return state, action , reward, next_state, terminal, w, rank_e_id
        
    def sample_from_replay(self):
        '''
        Samples a minibatch uniformly at random from slots 1..count-1,
        skipping terminal experiences.
        Returns (state, action, reward, next_state, terminal).
        '''
        # Sample random indexes
        indexes = []
        while len(indexes) < self.batch_size:
            while True:
                index = random.randint(1,self.count-1)
                # If index wraps over terminal state, get new one
                if self._experience[index][4]:
                    continue
                # Use the index otherwise
                break
            indexes.append(index)
            
        experience_batch = self.retrieve(indexes)
        state, action , reward, next_state, terminal = map(np.array, zip(*experience_batch))
        
        return state, action, reward, next_state, terminal 

In [32]:
# Smoke test: fill the replay memory with dummy transitions and draw one
# prioritized minibatch.
env = GymEnvironment('Breakout-v0',conf_parameters)

history = History(conf_parameters)
screen,_,_,_ = env.new_random_game()
        
# Stacking the first screen in every slot of the input buffer
for _ in range(conf_parameters['history_length']):
    history.add(screen)
    
replay = Rank_Priority_Replay(conf_parameters)
# Store a few more transitions than 'learn_start' so that sampling is allowed
for i in range(conf_parameters['learn_start'] + 10):
    replay.add((history.get(),i,1,history.get(),False))
    
print('done')

#s,a,r,st,t = replay.sample_from_replay()
s2,a2,r2,st2,t2,w,ids = replay.sample_from_priority_replay(1)

[2016-12-16 15:14:41,699] Making new env: Breakout-v0


done
