<a href="https://colab.research.google.com/github/adityapal99/MultiArmedBanditStudy/blob/main/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multi Armed Bandit Example

In [1]:
!pip3 install tf-agents


import sys
sys.path.append('/usr/local/lib/python3.6/site-packages/')

Collecting tf-agents
  Downloading tf_agents-0.8.0-py3-none-any.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.4 MB/s 
Collecting tensorflow-probability==0.12.2
  Downloading tensorflow_probability-0.12.2-py2.py3-none-any.whl (4.8 MB)
[K     |████████████████████████████████| 4.8 MB 48.4 MB/s 
Installing collected packages: tensorflow-probability, tf-agents
  Attempting uninstall: tensorflow-probability
    Found existing installation: tensorflow-probability 0.13.0
    Uninstalling tensorflow-probability-0.13.0:
      Successfully uninstalled tensorflow-probability-0.13.0
Successfully installed tensorflow-probability-0.12.2 tf-agents-0.8.0


In [1]:
import abc
import numpy as np
import tensorflow as tf
import pandas as pd

from tf_agents.agents import tf_agent
from tf_agents.drivers import driver
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.policies import tf_policy
from tf_agents.specs import array_spec
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.trajectories import policy_step

import pprint

# Clear any leftover state from previous colabs run.
# (This is not necessary for normal programs.)
tf.compat.v1.reset_default_graph()

tf.compat.v1.enable_resource_variables()
tf.compat.v1.enable_v2_behavior()
nest = tf.compat.v2.nest

## Setting Up Python Environment using Tensorflow

- Creating an Abstract Method to modify the environment based on requirements
- Using `tf_agents.environment.py_environment.PyEnvironment` and inheriting it to create the environment.

In [None]:
class AbstractBanditPyEnvironment(py_environment.PyEnvironment):
    def __init__(self, observation_spec, action_spec):
        self._observation_spec = observation_spec
        self._action_spec = action_spec
        super(AbstractBanditPyEnvironment, self).__init__()

    # Helper functions.
    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _empty_observation(self):
        return tf.nest.map_structure(lambda x: np.zeros(x.shape, x.dtype), self.observation_spec())

    # These two functions below should not be overridden by subclasses.
    def _reset(self):
        """Returns a time step containing an observation."""
        return ts.restart(self._observe(), batch_size=self.batch_size)

    def _step(self, action):
        """Returns a time step containing the reward for the action taken."""
        reward = self._apply_action(action)
        return ts.termination(self._observe(), reward)

    # These two functions below are to be implemented in subclasses.
    @abc.abstractmethod
    def _observe(self):
        """Returns an observation."""

    @abc.abstractmethod
    def _apply_action(self, action):
        """Applies `action` to the Environment and returns the corresponding reward.
        """



### Custom MultiArmedBandit Implementation

- Observe the method `MultiArmedBanditPyEnvironment._observe()`
    - Here we will set the observation algorithm based on sales or something.
    - Right now I am choosing a random observation based on price. But later more fields will be responsible for changing the `_observation` variable.

- Observer the method `MultiArmedBanditPyEnvironment._apply_action()`
    - Here we will use a formula to calculate the reward based on action and observation.
    - Right now a very simple `max` function is being used.
    - Later on we will use something like <strong>Softmax</strong> or <strong>Epsilon Greedy</strong>



In [None]:
class MultiArmedBanditPyEnvironment(AbstractBanditPyEnvironment):
    def __init__(self, max_price: np.float64, min_price: np.float64):
        self.max_price, self.min_price = max_price, min_price
        action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=2, name='action')
        observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=-2, maximum=2, name='observation')
        super(MultiArmedBanditPyEnvironment, self).__init__(observation_spec, action_spec)

    def _observe(self):
        self._observation = np.random.random() * (self.max_price - self.min_price) + self.min_price
        return self._observation

    def _apply_action(self, action):
        return max(action, self._observation)

In [24]:
value = np.random.random()

10.185117389997751


In [3]:
df = pd.read_csv('/content/sample_data/ElectronicsProductsPricingData.csv', encoding="UTF-8")
df.head()

test_cases_for_each_product = 10
price_predictions = list()


for row, value in df[['prices.amountMax', 'prices.amountMin']].iterrows():
    bandit = MultiArmedBanditPyEnvironment(max_price=value.array[0], min_price=value.array[1])
    first_obs = bandit.reset().observation

    price_predictions.append({'observations': [first_obs, ], 'rewards': []})
    for _ in range(10):
        action = np.random.random() * (bandit.max_price - bandit.min_price) + bandit.min_price
        result = bandit.step(action)
        price_predictions[row]['rewards'].append(result.reward)
        price_predictions[row]['observations'].append(result.observation)


In [6]:
pprint.pprint(price_predictions[:5])

[{'observations': [104.99,
                   104.99,
                   104.99,
                   104.99,
                   104.99,
                   104.99,
                   104.99,
                   104.99,
                   104.99,
                   104.99,
                   104.99],
  'rewards': [array(104.99, dtype=float32),
              array(104.99, dtype=float32),
              array(104.99, dtype=float32),
              array(104.99, dtype=float32),
              array(104.99, dtype=float32),
              array(104.99, dtype=float32),
              array(104.99, dtype=float32),
              array(104.99, dtype=float32),
              array(104.99, dtype=float32),
              array(104.99, dtype=float32)]},
 {'observations': [67.18574479266941,
                   68.85869234640361,
                   68.1847721008489,
                   65.75546474673988,
                   68.58260215891899,
                   68.1176324531035,
                   68.186444870410

In [None]:
environment = MultiArmedBanditPyEnvironment()
observation = environment.reset().observation
print(f"{observation = }")

action = 2
print(f"{action = }")
reward = environment.step(action).reward
print(f"{reward = }")

observation = array([0])
action = 2
reward = array([0.], dtype=float32)


In [None]:
tf_environment = tf_py_environment.TFPyEnvironment(environment)

In [None]:
class SignPolicy(tf_policy.TFPolicy):
    def __init__(self):
        observation_spec = tensor_spec.BoundedTensorSpec(
            shape=(1,), dtype=tf.int32, minimum=-2, maximum=2)
        time_step_spec = ts.time_step_spec(observation_spec)

        action_spec = tensor_spec.BoundedTensorSpec(
            shape=(), dtype=tf.int32, minimum=0, maximum=2)

        super(SignPolicy, self).__init__(time_step_spec=time_step_spec,
                                        action_spec=action_spec)
    def _distribution(self, time_step):
        pass

    def _variables(self):
        return ()

    def _action(self, time_step, policy_state, seed):
        observation_sign = tf.cast(tf.sign(time_step.observation[0]), dtype=tf.int32)
        action = observation_sign + 1
        return policy_step.PolicyStep(action, policy_state)

In [None]:
sign_policy = SignPolicy()

current_time_step = tf_environment.reset()
print(f'{current_time_step.observation = }')

action = sign_policy.action(current_time_step).action
print(f'{action = }')
reward = tf_environment.step(action).reward
print(f'{reward = }')

current_time_step.observation = <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[1]])>
action = <tf.Tensor: shape=(1,), dtype=int32, numpy=array([2])>
reward = <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[2.]], dtype=float32)>
