Some of the code below was inspired by the [Ptan](https://github.com/Shmuma/ptan) library.

**What are action selectors?**

An action selector turns the scores (or probabilities) computed for each action into a concrete action choice, according to some strategy. Common action selectors are:
- `ArgmaxActionSelector`
- `EpsilonGreedyActionSelector`
- `ProbabilityActionSelector`
- `UCBActionSelector` *(coming soon)*
- `EpsilonSoftActionSelector` *(coming soon)*

Researchers can also come up with their own action selector, as required by their reinforcement learning problem.

In [2]:
import numpy as np

In [6]:
class ArgmaxActionSelector:
    """Greedy selector: picks the highest-scoring action of every row."""

    def __call__(self, scores):
        """
        Return, for each row of ``scores``, the index of its largest entry.

        :param scores: np.array with at least two dimensions; the maximum
            is searched along axis 1

        :return: np.array holding the selected action indices
        """
        best_actions = np.argmax(scores, axis=1)
        return best_actions
    

In [70]:
class EpsilonGreedyActionSelector:
    """Selects actions using epsilon-greedy strategy.

    For every row of scores one random number is generated from the
    uniform distribution in the half-open interval [0.0, 1.0).

    We compare each one of them with the epsilon value.

    Exploration:
        - If the random number is less than epsilon,
        then a uniformly random action is selected.
    Exploitation:
        - If the random number is greater than or equal to epsilon,
        then the action chosen by the wrapped selector is kept
        (the highest-scoring action for the default selector).
    """

    def __init__(self, eps=0.05, selector=None):
        """
        :param eps: float in the closed interval [0.0, 1.0]
        :param selector: this is an action selector, and can be one of the following:
            - ArgmaxActionSelector (used when ``selector`` is None)
            - ProbabilityActionSelector
            - other custom user-defined action selector

        :raises ValueError: if ``eps`` lies outside [0.0, 1.0]
        """
        # Assign through the property so the constructor enforces the same
        # range check as later updates of epsilon.
        self.epsilon = eps
        # Fall back to greedy selection only when the caller gave nothing;
        # an explicitly supplied selector must never be replaced.
        self.selector = ArgmaxActionSelector() if selector is None else selector

    @property
    def epsilon(self):
        """
        Returns the epsilon value, which is a float.

        This is the getter method.
        """
        return self._eps

    @epsilon.setter
    def epsilon(self, value):
        """
        :param value: float.

        :raises ValueError: if ``value`` lies outside [0.0, 1.0]

        This is the setter method.
        """
        if value < 0.0 or value > 1.0:
            raise ValueError("Epsilon value must be between 0.0 and 1.0, inclusive.")
        self._eps = value

    def __call__(self, scores):
        """
        :param scores: np.array of shape (batch_size, n_actions)

        :return: np.array with one action per row of ``scores``
        """
        assert len(scores.shape) == 2, "scores must be a 2D array"
        batch_size, n_actions = scores.shape
        # Valid actions are the integers 0 .. n_actions - 1.
        # Copy the selector's output so that the in-place overwrite below
        # never modifies an array owned by the selector or its caller.
        actions = np.array(self.selector(scores))
        rand_nums = np.random.random(size=batch_size)
        # True marks the rows that explore instead of exploit
        mask = rand_nums < self.epsilon
        # draw exactly one uniformly random action for every exploring row
        rand_actions = np.random.choice(n_actions, int(mask.sum()))
        # fill random actions in the positions where mask is True
        actions[mask] = rand_actions
        return actions
    

In [64]:
def fn():
    """Demonstrate the boolean-mask overwrite used for exploration.

    Builds ten placeholder actions (100..109), draws one uniform random
    number per entry, and replaces the entries whose draw falls below 0.1
    with random actions taken from ``range(4)``. The resulting actions,
    the replacement values, their count and the mask are printed.
    """
    batch, num_choices, threshold = 10, 4, 0.1
    draws = np.random.random(batch)
    chosen = np.arange(100, 100 + batch)
    explore = draws < threshold
    # number of positions that were selected for replacement
    n_explore = sum(explore)
    replacements = np.random.choice(num_choices, n_explore)
    chosen[explore] = replacements
    for item in (chosen, replacements, n_explore, explore):
        print(item)


fn()

[100 101 102 103 104 105 106 107 108 109]
[]
0
[False False False False False False False False False False]


In [71]:
class ProbabilityActionSelector:
    """
    Converts probabilities of actions into actions by sampling them.
    """
    def __call__(self, probs):
        """
        :param probs: np.array of shape (batch_size, n_actions); every row
            is passed to ``np.random.choice`` as ``p`` and therefore has
            to be a valid probability distribution

        :return: np.array of shape (batch_size,) with one sampled action
            index per row
        """
        # The number of candidate actions is the length of each row, not
        # the number of rows: using len(probs) here breaks as soon as
        # batch_size differs from n_actions.
        sampled = [np.random.choice(len(prob), p=prob) for prob in probs]
        # An explicit integer dtype keeps an empty batch usable as indices.
        return np.array(sampled, dtype=int)
    

In [72]:
class EpsilonTracker:
    """Updates epsilon according to linear schedule.

    Epsilon starts at ``eps_start`` and decreases by ``1 / eps_frames``
    per frame until it reaches ``eps_final``, where it stays.
    """
    def __init__(self, selector, eps_start, eps_final, eps_frames):
        """
        :param selector: EpsilonGreedyActionSelector (any object with a
            writable ``epsilon`` attribute works)
        :param eps_start: int or float, epsilon at frame 0
        :param eps_final: int or float, lower bound of epsilon
        :param eps_frames: int, divisor of the frame index; must be non-zero
        """
        self.selector = selector
        self.eps_start = eps_start
        self.eps_final = eps_final
        self.eps_frames = eps_frames
        # initialise the selector with the epsilon of the first frame
        self.frame(0)

    def frame(self, frame):
        """
        Set the selector's epsilon to the scheduled value for ``frame``.

        :param frame: int, index of the current frame
        """
        # Only the frame index is scaled by eps_frames. Dividing the whole
        # difference (eps_start - frame) would start the schedule at
        # eps_start / eps_frames instead of eps_start.
        eps = self.eps_start - frame / self.eps_frames
        self.selector.epsilon = max(self.eps_final, eps)


In [None]:
# TODO: implement upper-confidence bound action selector
class UCBActionSelector:
    """
    Selects actions using the upper-confidence bound (UCB) strategy.

    The selection rule itself is not implemented yet; calling an instance
    raises NotImplementedError.
    """
    def __init__(self, c):
        """
        :param c: float, stored as ``self.c`` (presumably the exploration
            coefficient of the UCB bonus; it is not used anywhere yet)
        """
        self.c = c

    def __call__(self, scores):
        """
        :param scores: np.array

        :return: np.array

        :raises NotImplementedError: always, until the strategy is written
        """
        # Fail loudly: silently returning None would only surface later as
        # a confusing error in whatever code consumes the actions.
        raise NotImplementedError("UCBActionSelector is not implemented yet.")

In [None]:
# TODO: implement epsilon soft action selector
class EpsilonSoftActionSelector:
    """Selects actions using epsilon-soft strategy.

    NOTE: the selection rule implemented here is currently the same
    per-row rule as epsilon-greedy (see the TODO above this class).

    For every row of scores one random number is generated from the
    uniform distribution in the half-open interval [0.0, 1.0).

    We compare each one of them with the epsilon value.

    Exploration:
        - If the random number is less than epsilon,
        then a uniformly random action is selected.
    Exploitation:
        - If the random number is greater than or equal to epsilon,
        then the action chosen by the wrapped selector is kept
        (the highest-scoring action for the default selector).
    """

    def __init__(self, eps=0.05, selector=None):
        """
        :param eps: float in the closed interval [0.0, 1.0]
        :param selector: this is an action selector, and can be one of the following:
            - ArgmaxActionSelector (used when ``selector`` is None)
            - ProbabilityActionSelector
            - other custom user-defined action selector

        :raises ValueError: if ``eps`` lies outside [0.0, 1.0]
        """
        # Assign through the property so the constructor enforces the same
        # range check as later updates of epsilon.
        self.epsilon = eps
        # Fall back to greedy selection only when the caller gave nothing;
        # an explicitly supplied selector must never be replaced.
        self.selector = ArgmaxActionSelector() if selector is None else selector

    @property
    def epsilon(self):
        """
        Returns the epsilon value, which is a float.

        This is the getter method.
        """
        return self._eps

    @epsilon.setter
    def epsilon(self, value):
        """
        :param value: float.

        :raises ValueError: if ``value`` lies outside [0.0, 1.0]

        This is the setter method.
        """
        if value < 0.0 or value > 1.0:
            raise ValueError("Epsilon value must be between 0.0 and 1.0, inclusive.")
        self._eps = value

    def __call__(self, scores):
        """
        :param scores: np.array of shape (batch_size, n_actions)

        :return: np.array with one action per row of ``scores``
        """
        assert len(scores.shape) == 2, "scores must be a 2D array"
        batch_size, n_actions = scores.shape
        # Valid actions are the integers 0 .. n_actions - 1.
        # Copy the selector's output so that the in-place overwrite below
        # never modifies an array owned by the selector or its caller.
        actions = np.array(self.selector(scores))
        rand_nums = np.random.random(size=batch_size)
        # True marks the rows that explore instead of exploit
        mask = rand_nums < self.epsilon
        # draw exactly one uniformly random action for every exploring row
        rand_actions = np.random.choice(n_actions, int(mask.sum()))
        # fill random actions in the positions where mask is True
        actions[mask] = rand_actions
        return actions
    