## Contextual Bandits
Here we create a very basic agent and environment: at each step the agent chooses one film, at random, among the ones proposed by the environment.

In [1]:
# Basic import
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from copy import copy, deepcopy
from scipy.stats import invgamma, gamma
from scipy.stats import t as student

In [12]:
class RandomAgent:
    """Baseline agent that recommends one of the proposed films at random."""

    def __init__(self, seed=None):
        # A private generator keeps runs reproducible for a fixed seed.
        self._rng = np.random.RandomState(seed)

    def act(self, user_id, recommended_films):
        """Return one entry of ``recommended_films`` drawn by the agent's RNG.

        ``user_id`` is accepted for interface compatibility but is not used.
        """
        return self._rng.choice(recommended_films)

In [23]:
class CMAB:
    """
    Contextual multi-armed bandit environment for film recommendation.

    Every film and every user is described by a feature vector of length
    ``context_size`` drawn from ``RandomState.uniform``. The reward for
    recommending film ``j`` to user ``i`` is an integer in 1..5 obtained from
    the Euclidean distance between the two vectors, scaled by the largest
    distance in the matrix. Once a film has been recommended to a user it is
    no longer proposed to that user until ``reset`` is called.
    """

    def __init__(self, nb_films, nb_users, context_size, seed=None):
        """
        Build the environment.

        Args:
            nb_films: number of films (arms).
            nb_users: number of users.
            context_size: length of each user / film feature vector.
            seed: optional seed for the internal ``np.random.RandomState``.
        """
        self._nb_films = nb_films
        self._nb_users = nb_users
        self._p = context_size  # size of a user vector == size of a film vector
        self._rng = np.random.RandomState(seed)
        # Films are drawn before users; this order fixes the random stream
        # obtained for a given seed.
        self._films = self._rng.uniform(size=(nb_films, context_size))
        self._users = self._rng.uniform(size=(nb_users, context_size))
        self._reward_matrix = self._compute_reward_matrix()
        self._available_films = np.ones((nb_users, nb_films))

    def _compute_reward_matrix(self):
        """Return the (nb_users, nb_films) integer reward matrix, values 1..5."""
        # Broadcast (nb_users, 1, p) against (1, nb_films, p) to get every
        # user/film difference vector at once, then take the 2-norm over p.
        gaps = self._users[:, np.newaxis, :] - self._films[np.newaxis, :, :]
        distances = np.linalg.norm(gaps, ord=2, axis=2)
        peak = distances.max() if distances.size else 0.0
        if peak > 0:
            scaled = distances / peak * 4
        else:
            # All users and films coincide (or the matrix is empty): avoid a
            # division by zero and give every pair the lowest reward.
            scaled = np.zeros_like(distances)
        # Truncation maps [0, 4] onto {0..4}; the +1 shifts rewards to 1..5.
        return scaled.astype(int) + 1

    def step(self):
        """
        Draw a user at random and list the films still available to them.

        Returns:
            (user, available_films): the user index and a 1-D array of film
            indices not yet recommended to that user. The array is empty once
            every film has been recommended to the user.
        """
        user = self._rng.randint(0, self._nb_users)
        available_films = np.flatnonzero(self._available_films[user] == 1)
        return user, available_films

    def update(self, user, film):
        """
        Recommend ``film`` to ``user``.

        The film is marked unavailable for that user and the corresponding
        entry of the reward matrix is returned.
        """
        reward = self._reward_matrix[user, film]
        self._available_films[user, film] = 0
        return reward

    def reset(self):
        """
        Start a new episode.

        Users are resampled (films are kept), the reward matrix is recomputed
        and every film becomes available to every user again.

        Returns:
            A copy of the new (nb_users, context_size) user feature matrix.
        """
        self._users = self._rng.uniform(size=(self._nb_users, self._p))
        self._reward_matrix = self._compute_reward_matrix()
        # Use the sizes stored on the instance, not module-level globals.
        self._available_films = np.ones((self._nb_users, self._nb_films))
        return self._users.copy()

In [24]:
# Basic parameters of the experiment

nb_users = 10 # number of users in the environment
nb_films = 10 # number of films that can be recommended
nb_features = 2 # passed to CMAB as context_size (length of each user / film feature vector)
nb_iteration = 20 # number of interaction steps run by the trial loop

In [25]:
# Build the environment, then reset it to start a fresh episode.

env = CMAB(nb_films=nb_films, nb_users=nb_users, context_size=nb_features)
env.reset()  # last expression of the cell: the returned user features are displayed

array([[0.99950726, 0.92374656],
       [0.27108366, 0.8205857 ],
       [0.26191241, 0.38855628],
       [0.76975835, 0.13069826],
       [0.70841285, 0.49818746],
       [0.64278088, 0.74700825],
       [0.04899137, 0.28122565],
       [0.43999946, 0.48934559],
       [0.58861554, 0.80239152],
       [0.60564912, 0.48580851]])

In [26]:
# Create the agent with a fixed seed so its choices are reproducible.

agent = RandomAgent(seed=2020)

In [28]:
# Run several trials: the environment proposes films for a user, the agent
# picks one, and the environment returns the reward for that pick.

trial_report = "user_id ={}, recommended_films = {}, choosen_film = {}"
reward_report = "reward = {}\n"
for i in range(nb_iteration):
    user_id, recommended_films = env.step()
    print(recommended_films)
    choosen_film = agent.act(user_id, recommended_films)
    print(trial_report.format(user_id, recommended_films, choosen_film))
    reward = env.update(user_id, choosen_film)
    print(reward_report.format(reward))


[0 1 2 3 4 5 6 7 9]
user_id =7, recommended_films = [0 1 2 3 4 5 6 7 9], choosen_film = 9
reward = 1

[1 2 3 5 6 7 8 9]
user_id =0, recommended_films = [1 2 3 5 6 7 8 9], choosen_film = 8
reward = 4

[1 2 5 6 8]
user_id =9, recommended_films = [1 2 5 6 8], choosen_film = 8
reward = 3

[0 1 2 3 4 5 6 7 8 9]
user_id =5, recommended_films = [0 1 2 3 4 5 6 7 8 9], choosen_film = 1
reward = 1

[0 1 2 3 4 5 6 8 9]
user_id =2, recommended_films = [0 1 2 3 4 5 6 8 9], choosen_film = 1
reward = 3

[0 2 3 4 5 6 8 9]
user_id =2, recommended_films = [0 2 3 4 5 6 8 9], choosen_film = 9
reward = 1

[0 1 2 3 4 5 6 7 8 9]
user_id =8, recommended_films = [0 1 2 3 4 5 6 7 8 9], choosen_film = 5
reward = 2

[0 1 2 3 4 7 9]
user_id =3, recommended_films = [0 1 2 3 4 7 9], choosen_film = 2
reward = 4

[0 1 2 3 4 6 7 8 9]
user_id =8, recommended_films = [0 1 2 3 4 6 7 8 9], choosen_film = 6
reward = 2

[1 2 5 6]
user_id =9, recommended_films = [1 2 5 6], choosen_film = 2
reward = 2

[0 1 2 3 4 7 8 9]
user_i