# Steam (Environment - Agent)

In [1]:
# Basic import
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy
from scipy.stats import norm
import pdb
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, Dot, Concatenate
from tensorflow.keras.models import Model
# Make functions wrapped by tf.function run eagerly (step by step) instead of as traced graphs.
# NOTE(review): presumably enabled to simplify debugging of the Keras models below; this
# experimental API was superseded in later TF releases - confirm before upgrading TensorFlow.
tf.config.experimental_run_functions_eagerly(True)
# Print the TensorFlow version in use so the recorded outputs can be tied to it.
print(tf.__version__)

2.0.0


## Environment

In [49]:
class Environment:
    """
    Contextual Multi-Armed Bandit environment.

    Every user and every game owns a latent vector of size ``context_size``.
    The rating (reward) of a (user, game) pair is the dot product of the two
    latent vectors, bucketed into an integer score between 1 and 5.
    Each (user, game) pair can be played only once per episode.

    ``reset()`` must be called before ``step()``, ``update()`` or
    ``get_feature_vector()``: the latent vectors, the availability matrix and
    the reward matrix are created there.

    Parameters
    ----------
    nb_films : int
        Number of games (arms). The name is kept as-is so existing keyword
        calls keep working; it is stored as ``self._nb_games``.
    nb_users : int
        Number of users.
    context_size : int
        Size of the latent vector of each user and each game.
    displayed_users_embedding_size : int
        How many leading components of the user vector are exposed in the
        feature vector.
    displayed_games_embedding_size : int
        How many leading components of the game vector are exposed in the
        feature vector.
    noise_size : int
        Number of pure-noise components appended to the feature vector.
    seed : int or None
        Seed of the private ``numpy.random.RandomState``.
    """
    def __init__(self, nb_films, nb_users,
                 context_size = 2,
                 displayed_users_embedding_size = 2, #used for the features vector
                 displayed_games_embedding_size = 2, #used for the features vector
                 noise_size = 3,
                 seed=None):

        self._rng = np.random.RandomState(seed)
        #-------------------------------------------------------#

        # Use the constructor argument, not a notebook-level global.
        self._nb_games = nb_films
        self._nb_users = nb_users
        self._p = context_size # size of user, size of game
        self._displayed_users_embedding_size = displayed_users_embedding_size
        self._displayed_games_embedding_size = displayed_games_embedding_size
        self._noise_size = noise_size

        #-------------------------------------------------------#

        # Per-component mean / variance of the latent vectors (all ones).
        self.user_mean = np.ones(self._p)
        self.user_var = np.ones(self._p)
        self.game_mean = np.ones(self._p)
        self.game_var = np.ones(self._p)

        #-------------------------------------------------------#
        self.finish = False # flag to know when reset the environment (all games played)

    def step(self):
        """
        Pick the next user to serve.

        Returns
        -------
        (user, available_games, finish)
            ``user`` is the index of a user that still has at least one game
            to play, ``available_games`` the indices of the games that user
            has not played yet. When every pair has been played the method
            returns ``(0, 0, True)`` and sets ``self.finish``.
        """
        if self._available_games.sum() == 0:#if all players played all games
            self.finish = True
            print("All games played reset the environment")
            return 0, 0, self.finish

        # Always a user that has at least one game still to play.
        user = self.get_next_user()
        available_games = np.where(self._available_games[user] == 1)[0]

        return user, available_games, self.finish

    def get_next_user(self):
        """
        Draw a user that still has at least one game available.

        A user is first drawn uniformly at random. If that user has already
        played everything, a replacement is drawn from the row indices of all
        remaining (user, game) pairs, so users with more remaining games are
        proportionally more likely to be picked.
        """
        candidate = self._rng.randint(0, self._nb_users)

        if np.sum(self._available_games[candidate, :]) > 0:#still some games to play
            return candidate

        # All games played by the drawn user: pick among users that still have games.
        rows, _ = np.where(self._available_games == 1)
        return self._rng.choice(rows)

    def update(self, user, game):
        """Play ``game`` for ``user``: mark the pair as used and return its reward."""
        reward = self._reward_matrix[user, game]
        self._available_games[user, game] = 0
        return reward

    def reset(self):
        """
        Start a new episode.

        Draws fresh latent vectors for users and games, makes every pair
        available again and rebuilds the reward matrix.

        Returns
        -------
        numpy.ndarray
            A copy of the user latent vectors, shape ``(nb_users, context_size)``.
        """
        self.finish = False
        # NOTE: the *_var arrays are passed as the standard deviation (``scale``);
        # this is equivalent only because they are all ones.
        self._users = self._rng.normal(loc=self.user_mean,
                                       scale=self.user_var,
                                       size=(self._nb_users, self._p))
        self._games = self._rng.normal(loc=self.game_mean,
                                       scale=self.game_var,
                                       size=(self._nb_games, self._p))

        # Mean and variance of the user.game dot product, summed over components:
        # Var(xy) = var_x*var_y + var_x*mean_y^2 + var_y*mean_x^2 for independent x, y.
        z_mean = self.user_mean.dot(self.game_mean)
        z_var = self.user_var.dot(self.game_var) + self.user_var.dot(np.square(self.game_mean)) + \
                self.game_var.dot(np.square(self.user_mean))
        # The score distribution is modelled as a normal; its quintiles are the bucket edges.
        z = norm(z_mean, np.sqrt(z_var))
        self.z_cut_points = z.ppf([0.2, 0.4, 0.6, 0.8]) # buckets

        # Sized from the instance, not from notebook-level globals.
        self._available_games = np.ones((self._nb_users, self._nb_games))

        # All pairwise dot products at once, then bucketed into ratings 1..5.
        # Kept as float to match the original reward dtype.
        real_scores = self._users @ self._games.T
        self._reward_matrix = (np.searchsorted(self.z_cut_points, real_scores) + 1).astype(float)

        return self._users.copy()

    def get_feature_vector(self, user, game):
        """
        Build the observable feature vector of a (user, game) pair.

        The vector is always flat (1-D) and is the concatenation of the first
        ``displayed_users_embedding_size`` components of the user vector, the
        first ``displayed_games_embedding_size`` components of the game vector
        and, when ``noise_size > 0``, ``noise_size`` fresh noise samples drawn
        from a normal with mean 1 and scale 1.
        """
        parts = [self._users[user][:self._displayed_users_embedding_size],
                 self._games[game][:self._displayed_games_embedding_size]]

        if self._noise_size > 0:
            noise = self._rng.normal(loc=np.ones(self._noise_size),
                                     scale=np.ones(self._noise_size),
                                     size=self._noise_size)
            parts.append(noise)

        # Concatenating keeps the result 1-D whether or not noise is appended,
        # and also works when the two displayed sizes differ.
        return np.concatenate(parts)

## Agent

In [50]:
class RandomAgent:
    """
    Baseline agent that ignores the user and picks any available game at random.

    The agent owns a private ``numpy.random.RandomState`` so that its choices
    are reproducible for a given ``seed``.
    """
    def __init__(self, seed = None):
        # Dedicated generator: keeps the agent independent from the global numpy seed.
        self._rng = np.random.RandomState(seed)

    def act(self, available_games):
        """Return one element of ``available_games`` chosen by the agent's generator."""
        return self._rng.choice(available_games)

## Experiment

In [51]:
# Basic parameters shared by the environment and the embedding models
nb_users = 30 #number of users in the environment
nb_games = 10 #number of games in the environment
context_size = 2 #size of the latent vector drawn for every user and every game

In [52]:
# Creating the environment (seeded so that the drawn users/games are reproducible)
env = Environment(nb_games,nb_users,context_size,seed=2020)
# reset() draws the latent vectors and builds the reward matrix; it returns a copy
# of the user vectors, which is what this cell displays.
env.reset() #reset and initialize the environment

array([[-0.76884571,  1.07555227],
       [-0.1306297 ,  0.34856983],
       [ 0.10688437, -0.27410098],
       [ 0.93884557,  1.06451384],
       [ 1.41011295,  0.42711751],
       [ 0.19866638,  2.31203519],
       [ 2.27469887, -0.2143576 ],
       [ 1.31371941, -0.44482142],
       [ 0.6310387 ,  0.23077342],
       [ 1.3926161 ,  1.05729383],
       [ 3.08997884,  1.04197131],
       [ 0.95165928,  0.48684608],
       [ 0.91541072, -0.21545008],
       [-0.41293073, -0.48691055],
       [ 1.38222486,  1.937673  ],
       [ 2.77267804,  1.87882801],
       [ 1.33171912,  0.69396433],
       [ 2.24026615,  0.78437316],
       [ 1.15592948,  1.09805553],
       [ 1.83209585,  3.04520542],
       [ 0.68318608, -0.31283291],
       [-0.75445746,  1.10209408],
       [-0.36150208,  1.48178488],
       [ 0.79167126,  0.90813649],
       [ 1.70268816,  1.10365506],
       [ 1.62123638,  1.95411497],
       [ 3.03781352,  0.51554878],
       [ 1.2071549 ,  2.64424216],
       [ 0.5117926 ,

In [53]:
# Creating the agent: a seeded RandomAgent, used to collect the historical ratings below
agent = RandomAgent(2020)

We run the experiment with the random agent to generate historical (user, game, rating) data, which is later used to train the embedding models.

In [54]:
# Running several trials with the random agent to collect historical ratings.
# Only nb_users * nb_games (user, game) pairs exist and each can be played once,
# so the number of trials must stay well below that total: otherwise the
# environment is exhausted here and nothing is left for the learned agent to
# recommend afterwards.
nb_iteration = 100 #how many trials
rating_matrix = np.zeros((env._nb_users, env._nb_games)) # 0 means "not rated yet"
users = list()   # user index of every trial
games = list()   # game chosen at every trial
ratings = list() # reward obtained at every trial
for i in range(nb_iteration):
    user, available_games, finish = env.step()
    if finish:
        print("Maybe too many trial try to reduce and reset the environment")
        break
    choosen_game = agent.act(available_games)
    reward = env.update(user, choosen_game)
    users.append(user)
    games.append(choosen_game)
    ratings.append(reward)
    rating_matrix[user, choosen_game] = reward

print("rating matrix: \n", str(rating_matrix))

300.0
299.0
298.0
297.0
296.0
295.0
294.0
293.0
292.0
291.0
290.0
289.0
288.0
287.0
286.0
285.0
284.0
283.0
282.0
281.0
280.0
279.0
278.0
277.0
276.0
275.0
274.0
273.0
272.0
271.0
270.0
269.0
268.0
267.0
266.0
265.0
264.0
263.0
262.0
261.0
260.0
259.0
258.0
257.0
256.0
255.0
254.0
253.0
252.0
251.0
250.0
249.0
248.0
247.0
246.0
245.0
244.0
243.0
242.0
241.0
240.0
239.0
238.0
237.0
236.0
235.0
234.0
233.0
232.0
231.0
230.0
229.0
228.0
227.0
226.0
225.0
224.0
223.0
222.0
221.0
220.0
219.0
218.0
217.0
216.0
215.0
214.0
213.0
212.0
211.0
210.0
209.0
208.0
207.0
206.0
205.0
204.0
203.0
202.0
201.0
200.0
199.0
198.0
197.0
196.0
195.0
194.0
193.0
192.0
191.0
190.0
189.0
188.0
187.0
186.0
185.0
184.0
183.0
182.0
181.0
180.0
179.0
178.0
177.0
176.0
175.0
174.0
173.0
172.0
171.0
170.0
169.0
168.0
167.0
166.0
165.0
164.0
163.0
162.0
161.0
160.0
159.0
158.0
157.0
156.0
155.0
154.0
153.0
152.0
151.0
150.0
149.0
148.0
147.0
146.0
145.0
144.0
143.0
142.0
141.0
140.0
139.0
138.0
137.0
136.0
135.0
134.

## Regression model

In [8]:
class RegressionModel(Model):
    """
    Matrix-factorisation style rating model.

    One embedding table for users and one for games; the predicted rating of
    a (user, game) pair is the dot product of the two embedding vectors.

    Parameters
    ----------
    embedding_size : int
        Dimension of both embedding tables.
    max_user : int
        Number of rows of the user embedding table.
    max_game : int
        Number of rows of the game embedding table.
    """
    def __init__(self, embedding_size, max_user, max_game):
        super().__init__()

        # User table is created first, game table second.
        self.user_embedding = Embedding(input_dim=max_user,
                                        output_dim=embedding_size,
                                        input_length=1,
                                        name='user_embedding')
        self.game_embedding = Embedding(input_dim=max_game,
                                        output_dim=embedding_size,
                                        input_length=1,
                                        name='game_embedding')
        self.flatten = Flatten()
        self.dot = Dot(axes=1)

    def call(self, inputs):
        """Return the user/game embedding dot product for ``inputs = [user_ids, game_ids, ...]``."""
        user_ids, game_ids = inputs[0], inputs[1]

        # Look up each id, then flatten the looked-up embeddings before the dot product.
        flat_user = self.flatten(self.user_embedding(user_ids))
        flat_game = self.flatten(self.game_embedding(game_ids))

        return self.dot([flat_user, flat_game])

## Deep regression model

In [9]:
class DeepRegressionModel(Model):
    """
    Deep rating model: user and game embeddings are concatenated and passed
    through a small fully connected network that outputs one rating per sample.

    Parameters
    ----------
    embedding_size : int
        Dimension of both embedding tables.
    max_user : int
        Number of rows of the user embedding table.
    max_game : int
        Number of rows of the game embedding table.
    """

    def __init__(self, embedding_size, max_user, max_game):
        super().__init__()

        # The two embedding tables are created first, before any dense layer.
        self.user_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_user,
                                        input_length=1,
                                        name='user_embedding')
        self.game_embedding = Embedding(output_dim=embedding_size,
                                        input_dim=max_game,
                                        input_length=1,
                                        name='game_embedding')

        self.flatten = Flatten()
        self.concat = Concatenate()

        self.dense1 = Dense(16, activation="relu")
        self.dense2 = Dense(8, activation="relu")
        # Linear single-unit head: a regression on the rating needs one scalar
        # per sample, not the 8 ReLU activations of the last hidden layer.
        self.rating_output = Dense(1)

    def call(self, inputs, training=False):
        """
        Predict one rating per sample.

        ``inputs`` is ``[user_ids, game_ids, ...]``. Any extra element (e.g. the
        feature vectors) is accepted but currently not fed to the network.
        """
        user_inputs = inputs[0]
        game_inputs = inputs[1]

        user_vecs = self.flatten(self.user_embedding(user_inputs))
        game_vecs = self.flatten(self.game_embedding(game_inputs))

        # Only the two embeddings are used as network input.
        input_vecs = self.concat([user_vecs, game_vecs])

        y = self.dense1(input_vecs)
        y = self.dense2(y)

        return self.rating_output(y)

## Embedding agent

In [10]:
class EmbeddingAgent:
    """
    Agent that learns user/game embeddings from historical ratings and
    recommends the available game whose embedding is most similar (cosine
    similarity) to the user's embedding.

    Parameters
    ----------
    X : list
        Model inputs, ``[users, games]`` (plus features for the deep model).
    Y : array-like
        Observed ratings, one per training sample.
    deepRegression : bool
        Train a ``DeepRegressionModel`` instead of a ``RegressionModel``.
    max_user, max_game : int or None
        Sizes of the user / game embedding tables. When omitted, the
        notebook-level ``nb_users`` / ``nb_games`` are used (previous behaviour).
    """
    def __init__(self, X, Y, deepRegression=False, max_user=None, max_game=None):
        # Fall back to the notebook globals only when the sizes are not given.
        if max_user is None:
            max_user = nb_users
        if max_game is None:
            max_game = nb_games

        model_cls = DeepRegressionModel if deepRegression else RegressionModel
        self._model = model_cls(64, max_user, max_game)
        self._model.compile(optimizer="adam", loss='mae')
        self._model.fit(X, Y,
                  batch_size=64, epochs=100, validation_split=0.1,
                  shuffle=True)

        # The first two weight arrays are read as the user and game embedding tables.
        # NOTE(review): assumes get_weights() lists them first because both models
        # create the embedding layers before any other layer - confirm.
        weights = self._model.get_weights()
        self._user_embeddings = weights[0]
        self._game_embeddings = weights[1]

    def act(self, user, available_games):
        """
        Return the index of the available game most similar to ``user``.

        Raises
        ------
        IndexError
            If ``available_games`` is empty.
        """
        if len(available_games) == 0:
            raise IndexError("no available game to recommend")

        user_embedding = self._user_embeddings[user]

        # Cosine similarity between the user vector and every game vector.
        dot_products = self._game_embeddings @ user_embedding
        user_embedding_norm = np.linalg.norm(user_embedding)
        all_item_norms = np.linalg.norm(self._game_embeddings, axis=1)
        sims = dot_products / (user_embedding_norm * all_item_norms)

        # Game indices from most to least similar, restricted to the available ones.
        ranked_games = np.argsort(sims)[::-1]
        ranked_games = ranked_games[np.isin(ranked_games, available_games)]
        return ranked_games[0]

In [15]:
# Choose which model the EmbeddingAgent trains:
# False -> RegressionModel (dot product), True -> DeepRegressionModel.
deepRegression = False

# Turn the collected histories (Python lists) into numpy arrays for Keras.
# Note: this rebinds the same names, replacing the lists built in the data-collection cell.
users = np.array(users)
games = np.array(games)
ratings = np.array(ratings)

if deepRegression:
    # Build one feature vector per historical (user, game) pair.
    features = []
    for i in range(len(users)):
        features.append(env.get_feature_vector(users[i], games[i]))
    features = np.float64(features)
    # Training happens inside the constructor; `agent` now replaces the RandomAgent.
    agent = EmbeddingAgent([users, games, features], ratings, deepRegression=deepRegression)
else:
    # Training happens inside the constructor; `agent` now replaces the RandomAgent.
    agent = EmbeddingAgent([users, games], ratings, deepRegression=deepRegression)

Train on 90 samples, validate on 10 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epo

In [16]:
# Running several trials with the trained EmbeddingAgent on the same environment
# (games already played during data collection are no longer available).
nb_iteration = 20 #how many trials
for i in range(nb_iteration):
    user, available_games, finish = env.step()
    if finish:
        # Every (user, game) pair has been played: nothing left to recommend.
        print("Maybe too many trial try to reduce and reset the environment")
        break
    # Unlike the RandomAgent, this agent needs the user to rank the games.
    choosen_game = agent.act(user, available_games)
    reward = env.update(user, choosen_game)
    rating_matrix[user, choosen_game] = reward
    print("user = {}, available games = {}, choosen_game = {}".format(user,available_games,choosen_game))
    print("reward = {}\n".format(reward))

user = 24, available games = [1 2 3 4 5 6 7 8], choosen_game = 8
reward = 4.0

user = 12, available games = [1 3 5 6 7 8 9], choosen_game = 6
reward = 1.0

user = 23, available games = [0 1 3 4 5 7 8 9], choosen_game = 5
reward = 3.0

user = 26, available games = [0 1 2 3 4 7 9], choosen_game = 1
reward = 4.0

user = 26, available games = [0 2 3 4 7 9], choosen_game = 2
reward = 5.0

user = 29, available games = [0 4 5 6 8 9], choosen_game = 8
reward = 4.0

user = 1, available games = [0 1 5 6 8], choosen_game = 0
reward = 2.0

user = 8, available games = [3 4 6], choosen_game = 3
reward = 2.0

user = 7, available games = [0 4 6 7 8], choosen_game = 8
reward = 2.0

user = 12, available games = [1 3 5 7 8 9], choosen_game = 5
reward = 2.0

user = 5, available games = [0 2 4 5 7], choosen_game = 7
reward = 4.0

user = 27, available games = [2 5], choosen_game = 2
reward = 3.0

user = 14, available games = [1 2 3 4 5 6 9], choosen_game = 1
reward = 4.0

user = 15, available games = [0 1 2