In [6]:
def read_user_df(csv_path = "data/PP_users.csv"):
    pp_users = pd.read_csv(csv_path)
    pp_users.index = pp_users['u']
    pp_users['techniques'] = pp_users['techniques'].apply(ast.literal_eval)
    pp_users['items'] = pp_users['items'].apply(ast.literal_eval)
    pp_users['ratings'] = pp_users['ratings'].apply(ast.literal_eval)
    return pp_users

def read_recipe_df(csv_path = 'data/RAW_recipes.csv'):
    raw_recipes = pd.read_csv(csv_path)
    raw_recipes.ingredients = raw_recipes.ingredients.apply(ast.literal_eval)
    raw_recipes.tags = raw_recipes.tags.apply(ast.literal_eval)
    raw_recipes.nutrition = raw_recipes.nutrition.apply(ast.literal_eval)
    raw_recipes.steps = raw_recipes.steps.apply(ast.literal_eval)
    return raw_recipes
    

class UserDB():
    def __init__(self, csv_path = "data/PP_users.csv"):
        self.data = read_user_df(csv_path)
    
    def get_user(self, user_id):
        return User(self.data[self.data.u == user_id].iloc[0, :])
    
class RecipeDB():
    def __init__(self, csv_path = 'data/RAW_recipes.csv'):
        self.data = read_recipe_df(csv_path)
    
    def get_recipe(self, recipe_id):
        return Recipe(self.data[self.data.id == recipe_id].iloc[0, :])
        
class User():
    def __init__(self, series):
        self.data = series
        
    def get_recipes(self, recipe_db):
        recipe_series = recipe_db.data.set_index('id').iloc[self.data['items']]
        return [Recipe(row) for index, row in recipe_series.iterrows()]
            
    
class Recipe():
    def __init__(self, series):
        self.data = series
        
    def get_ingredients(self):
        return self.data['ingredients']
    

In [7]:
user_db = UserDB()
recipe_db = RecipeDB()

In [12]:
user_db.data

Unnamed: 0_level_0,u,techniques,items,n_items,ratings,n_ratings
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,"[8, 0, 0, 5, 6, 0, 0, 1, 0, 9, 1, 0, 0, 0, 1, ...","[1118, 27680, 32541, 137353, 16428, 28815, 658...",31,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...",31
1,1,"[11, 0, 0, 2, 12, 0, 0, 0, 0, 14, 5, 0, 0, 0, ...","[122140, 77036, 156817, 76957, 68818, 155600, ...",39,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",39
2,2,"[13, 0, 0, 7, 5, 0, 1, 2, 1, 11, 0, 1, 0, 0, 1...","[168054, 87218, 35731, 1, 20475, 9039, 124834,...",27,"[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ...",27
3,3,"[498, 13, 4, 218, 376, 3, 2, 33, 16, 591, 10, ...","[163193, 156352, 102888, 19914, 169438, 55772,...",1513,"[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ...",1513
4,4,"[161, 1, 1, 86, 93, 0, 0, 11, 2, 141, 0, 16, 0...","[72857, 38652, 160427, 55772, 119999, 141777, ...",376,"[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ...",376
...,...,...,...,...,...,...
25071,25071,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[144172, 154054]",2,"[5.0, 5.0]",2
25072,25072,"[1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[162436, 148228]",2,"[5.0, 5.0]",2
25073,25073,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[162681, 172034]",2,"[5.0, 5.0]",2
25074,25074,"[2, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[157808, 159896]",2,"[5.0, 5.0]",2


In [9]:
test_user = user_db.get_user(0).get_recipes(recipe_db)[0].get_ingredients()
test_user

['bananas',
 'rolled oats',
 'applesauce',
 'vanilla extract',
 'ground flax seeds',
 'dried cherries']

In [None]:
from gym import spaces
from gym.utils import seeding
import numpy as np

class ArmedBanditsEnv(gym.Env):
    """
    The famous k-Armed Bandit Environment, implemented for the gym interface.
    Initialization requires an array for the mean of each bandit, 
    as well as another array for the deviation from the mean for 
    each bandit. This arrays are then used to sample from the 
    distribution of a given bandit.
    """
    metadata = {'render.modes': ['human']}
    
    def __init__(self, mean, stddev):  
        super(ArmedBanditsEnv, self).__init__()
        # Define action and observation space
        self.num_bandits = mean.shape[1]
        self.num_experiments = mean.shape[0]
        self.action_space = spaces.Discrete(self.num_bandits)
        
        # Theres one state only in the k-armed bandits problem
        self.observation_space = spaces.Discrete(1)
        self.mean = mean
        self.stddev = stddev
        
    def step(self, action):
        # Sample from the specified bandit using it's reward distribution
        assert (action < self.num_bandits).all()
        
        sampled_means = self.mean[np.arange(self.num_experiments),action]
        sampled_stddevs = self.stddev[np.arange(self.num_experiments),action]
        
        reward = np.random.normal(loc=sampled_means, scale=sampled_stddevs, size=(self.num_experiments,))
        
        # Return a constant state of 0. Our environment has no terminal state
        observation, done, info = 0, False, dict()
        return observation, reward, done, info

## FOLLOWING TUTORIAL

In [214]:
import functools
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tf_agents.bandits.agents import lin_ucb_agent
from tf_agents.bandits.environments import stationary_stochastic_per_arm_py_environment as p_a_env
from tf_agents.bandits.metrics import tf_metrics as tf_bandit_metrics
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import tf_py_environment
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

nest = tf.nest

In [215]:
import pandas as pd
import pandas as pd
import ast

user_recipes = pd.read_csv('user_recipes.csv')
recipe_ingredients = pd.read_csv('recipe_ingreds.csv')
user_recipes.item_ratings = user_recipes.apply(lambda x: np.insert(np.array(ast.literal_eval(x['item_ratings']), dtype = np.float64), 0, x['u']), axis = 1)
recipe_ingredients.ingredient_ids = recipe_ingredients.ingredient_ids.apply(ast.literal_eval)
recipe_ingredients.ingredients = recipe_ingredients.apply(lambda x: np.insert(np.array(ast.literal_eval(x['ingredients']), dtype = np.float64), 0, float(x['new_id'])), axis = 1)

In [216]:
user_recipes['item_ratings']
recipe_ingredients['ingredients'][0]

array([0., 0., 0., ..., 0., 0., 0.])

In [217]:
num_recipes = len(user_recipes.item_ratings[0]) - 1
num_users = len(user_recipes)
num_ingredients = len(recipe_ingredients.ingredients[0]) - 1

In [227]:
# The dimension of the global features.
GLOBAL_DIM = num_recipes + 1  #@param {type:"integer"}
# The elements of the global feature will be integers in [-GLOBAL_BOUND, GLOBAL_BOUND).
GLOBAL_BOUND = 5  #@param {type:"integer"}
# The dimension of the per-arm features.
PER_ARM_DIM = num_ingredients + 1  #@param {type:"integer"}
# The elements of the PER-ARM feature will be integers in [-PER_ARM_BOUND, PER_ARM_BOUND).
PER_ARM_BOUND = 1  #@param {type:"integer"}
# The variance of the Gaussian distribution that generates the rewards.
VARIANCE = 1  #@param {type: "number"} ****
# The elements of the linear reward parameter will be integers in [-PARAM_BOUND, PARAM_BOUND).
PARAM_BOUND = 5  #@param {type: "integer"}

NUM_ACTIONS = num_recipes - 1  #@param {type:"integer"}
BATCH_SIZE = 1  #@param {type:"integer"}

# Parameter for linear reward function acting on the
# concatenation of global and per-arm features.
reward_param = list(np.random.randint(
      -PARAM_BOUND, PARAM_BOUND, [GLOBAL_DIM + PER_ARM_DIM]))

In [228]:
recipe_ingredients

Unnamed: 0,id,new_id,ingredient_ids,ingredients
0,108065,0,"[63, 5010, 1124, 7229, 1803, 7557, 519, 3815, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,285087,1,"[840, 6906, 1910, 2499, 7449, 2683, 1388, 6473]","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,324962,2,"[7557, 5010, 1093, 3399, 3203, 7143, 2125, 755...","[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,116522,3,"[7819, 4284, 3203, 2909, 2879, 840, 4096, 2816]","[3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,180301,4,"[840, 6696, 7049, 5555, 5033]","[4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
3423,223531,3423,"[2633, 5006, 2879, 6270, 5319]","[3423.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
3424,321698,3424,"[5724, 1045, 1871, 2757]","[3424.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
3425,432844,3425,"[5381, 2874, 6906, 1645, 1833, 4253]","[3425.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
3426,181420,3426,"[4717, 536, 2499, 6270, 5319, 4964, 221, 1170,...","[3426.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."


In [229]:
def get_random_user(user_recipes, num_users):
    return user_recipes.iloc[np.random.randint(num_users), :]

def get_random_recipe(num_recipes, recipe_ingredients):
    return recipe_ingredients.iloc[np.random.randint(num_recipes),:]

def get_random_user_context(user_recipes, num_users):
    return get_random_user(user_recipes, num_users)['item_ratings']

def get_random_recipe_context(recipe_ingredients, num_recipes):
    return get_random_recipe(num_recipes, recipe_ingredients)['ingredients']

In [230]:
get_random_recipe_context(recipe_ingredients, num_recipes).shape

(2785,)

In [231]:
def global_context_sampling_fn():
  """This function generates a single global observation vector."""
  return get_random_user_context(user_recipes, num_users)

def per_arm_context_sampling_fn():
  """"This function generates a single per-arm observation vector."""
  return get_random_recipe_context(recipe_ingredients, num_recipes)

def reward_fn(x):
    """This function generates a reward from the concatenated global and per-arm observations."""
    recipe_id = x[num_recipes]
    return x[1 + int(recipe_id)]

In [232]:
per_arm_py_env = p_a_env.StationaryStochasticPerArmPyEnvironment(
    global_context_sampling_fn,
    per_arm_context_sampling_fn,
    NUM_ACTIONS,
    reward_fn,
    batch_size=BATCH_SIZE
)
per_arm_tf_env = tf_py_environment.TFPyEnvironment(per_arm_py_env)

In [233]:

print('observation spec: ', per_arm_tf_env.observation_spec())
print('\nAn observation: ', per_arm_tf_env.reset().observation)

action = tf.zeros(BATCH_SIZE, dtype=tf.int32)
time_step = per_arm_tf_env.step(action)
print('\nRewards after taking an action: ', time_step.reward)

observation spec:  {'global': TensorSpec(shape=(3429,), dtype=tf.float64, name=None), 'per_arm': TensorSpec(shape=(3427, 2785), dtype=tf.float64, name=None)}

An observation:  {'global': <tf.Tensor: shape=(1, 3429), dtype=float64, numpy=array([[1547.,    0.,    0., ...,    0.,    0.,    0.]])>, 'per_arm': <tf.Tensor: shape=(1, 3427, 2785), dtype=float64, numpy=
array([[[2246.,    0.,    0., ...,    0.,    0.,    0.],
        [ 432.,    0.,    0., ...,    0.,    0.,    0.],
        [3039.,    0.,    0., ...,    0.,    0.,    0.],
        ...,
        [1557.,    0.,    0., ...,    0.,    0.,    0.],
        [1002.,    0.,    0., ...,    0.,    0.,    0.],
        [1517.,    0.,    0., ...,    0.,    0.,    0.]]])>}

Rewards after taking an action:  tf.Tensor([0.], shape=(1,), dtype=float32)


In [237]:
observation_spec = per_arm_tf_env.observation_spec()
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec(
    dtype=tf.int32, shape=(), minimum=0, maximum=NUM_ACTIONS-1)

agent = lin_ucb_agent.LinearUCBAgent(time_step_spec=time_step_spec,
                                     action_spec=action_spec,
                                     accepts_per_arm_features=True)

In [238]:

print('training data spec: ', agent.training_data_spec)

training data spec:  Trajectory(
{'action': BoundedTensorSpec(shape=(), dtype=tf.int32, name=None, minimum=array(0, dtype=int32), maximum=array(3426, dtype=int32)),
 'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'next_step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type'),
 'observation': {'global': TensorSpec(shape=(3429,), dtype=tf.float64, name=None)},
 'policy_info': PerArmPolicyInfo(log_probability=(), predicted_rewards_mean=(), multiobjective_scalarized_predicted_rewards_mean=(), predicted_rewards_optimistic=(), predicted_rewards_sampled=(), bandit_policy_type=(), chosen_arm_features=TensorSpec(shape=(2785,), dtype=tf.float64, name=None)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})


In [239]:
print('observation spec in training: ', agent.training_data_spec.observation)

observation spec in training:  {'global': TensorSpec(shape=(3429,), dtype=tf.float64, name=None)}


In [240]:
print('chosen arm features: ', agent.training_data_spec.policy_info.chosen_arm_features)

chosen arm features:  TensorSpec(shape=(2785,), dtype=tf.float64, name=None)


In [241]:
def _all_rewards(observation):
  """Outputs rewards for all actions, given an observation."""
  global_obs = observation['global'][1:]
  return rewards

def optimal_reward(observation):
  """Outputs the maximum expected reward for every element in the batch."""
  return tf.reduce_max(_all_rewards(observation), axis=1)

regret_metric = tf_bandit_metrics.RegretMetric(optimal_reward)

In [242]:
num_iterations = 20 # @param
steps_per_loop = 1 # @param

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.policy.trajectory_spec,
    batch_size=BATCH_SIZE,
    max_length=steps_per_loop)

observers = [replay_buffer.add_batch, regret_metric]

driver = dynamic_step_driver.DynamicStepDriver(
    env=per_arm_tf_env,
    policy=agent.collect_policy,
    num_steps=steps_per_loop * BATCH_SIZE,
    observers=observers)

regret_values = []

for _ in range(num_iterations):
  print(_)
  driver.run()
  loss_info = agent.train(replay_buffer.gather_all())
  replay_buffer.clear()
  regret_values.append(regret_metric.result())

0


ValueError: Tensor conversion requested dtype float64 for Tensor with dtype float32: <tf.Tensor: shape=(1, 2785), dtype=float32, numpy=array([[3425.,    0.,    0., ...,    0.,    0.,    0.]], dtype=float32)>