In [1]:
import numpy as np 
import gym

In [11]:
# Create a FoodTruck environment class that subclasses gym.Env
class FoodTruck(gym.Env):
    """Inventory-management environment for a food truck over one work week.

    A state is a ``(day, inventory)`` tuple and an action is the number of
    units purchased at the start of the day. Each day a random demand is
    drawn from ``v_demand`` with probabilities ``p_demand``; the reward is
    the revenue from units sold minus the cost of units purchased. The
    episode ends when the day becomes ``"Weekend"``.
    """

    def __init__(self):
        # Possible daily demand values and their probabilities (same order).
        self.v_demand = [100, 200, 300, 400]
        self.p_demand = [0.3, 0.4, 0.2, 0.1]
        # Inventory is capped at the largest possible demand.
        self.capacity = self.v_demand[-1]
        # "Weekend" is the terminal day (see is_terminal).
        self.days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', "Weekend"]
        # Cost paid per unit purchased and revenue earned per unit sold.
        self.unit_cost = 4
        self.net_revenue = 7
        self.action_space = [0, 100, 200, 300, 400]
        # Monday always starts empty; every later day may start with
        # 0, 100, 200 or 300 units left over.
        self.state_space = [("Mon", 0)] + [(d, i) for d in self.days[1:]
                                           for i in [0, 100, 200, 300]]

    def get_next_state_reward(self, state, action, demand):
        """Compute the outcome of taking ``action`` in ``state`` under ``demand``.

        Args:
            state: ``(day, inventory)`` tuple.
            action: number of units purchased.
            demand: number of units customers want to buy.

        Returns:
            dict with keys ``next_day``, ``starting_inventory``, ``cost``,
            ``sales``, ``revenue``, ``next_inventory`` and ``reward``.

        Raises:
            ValueError: if ``day`` is not in ``self.days``.
            IndexError: if ``day`` is the last entry of ``self.days``
                (there is no following day).
        """
        day, inventory = state
        result = {}
        result['next_day'] = self.days[self.days.index(day) + 1]
        # Stock above capacity is discarded, but the full purchase is
        # still charged in 'cost' below.
        result['starting_inventory'] = min(self.capacity, inventory + action)
        result['cost'] = self.unit_cost * action
        # Cannot sell more than what is on hand.
        result['sales'] = min(result['starting_inventory'], demand)
        result['revenue'] = self.net_revenue * result['sales']
        result['next_inventory'] = result['starting_inventory'] - result['sales']
        result['reward'] = result['revenue'] - result['cost']
        return result

    def get_transition_prob(self, state, action):
        """Return the distribution over ``(next_state, reward)`` outcomes.

        Args:
            state: ``(day, inventory)`` tuple.
            action: number of units purchased.

        Returns:
            dict mapping ``(next_state, reward)`` to its probability, where
            ``next_state`` is a ``(next_day, next_inventory)`` tuple.
        """
        next_s_r_prob = {}
        for demand, prob in zip(self.v_demand, self.p_demand):
            result = self.get_next_state_reward(state, action, demand)
            next_s = (result['next_day'], result['next_inventory'])
            outcome = (next_s, result['reward'])
            # Different demand values can lead to the same outcome (e.g.
            # whenever demand exceeds the stock on hand), so their
            # probabilities must be summed rather than overwritten.
            next_s_r_prob[outcome] = next_s_r_prob.get(outcome, 0) + prob
        return next_s_r_prob

    def reset(self):
        """Restart the episode at Monday with empty inventory and return that state."""
        self.day = "Mon"
        self.inventory = 0
        state = (self.day, self.inventory)
        return state

    def is_terminal(self, state):
        """Return True when the day component of ``state`` is ``"Weekend"``."""
        day, inventory = state
        return day == "Weekend"

    def step(self, action):
        """Advance one day by purchasing ``action`` units and sampling demand.

        Args:
            action: number of units purchased.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the new
            ``(day, inventory)`` tuple, ``done`` tells whether that state is
            terminal, and ``info`` holds the sampled ``demand`` and the
            resulting ``sales``.
        """
        demand = np.random.choice(self.v_demand, p=self.p_demand)
        result = self.get_next_state_reward((self.day, self.inventory),
                                            action, demand)
        self.day = result['next_day']
        self.inventory = result['next_inventory']
        state = (self.day, self.inventory)
        reward = result['reward']
        done = self.is_terminal(state)
        info = {'demand': demand, 'sales': result['sales']}
        return state, reward, done, info

In [12]:
# Estimate the average weekly reward of a fixed "order up to 300" policy by simulation
np.random.seed(0)
foodtruck = FoodTruck()
rewards = []
for i_episode in range(10000):
    # Each episode starts from the environment's initial state
    state = foodtruck.reset()
    ep_reward = 0
    done = False
    while True:
        day, inventory = state
        # Top the stock up to 300 units; buy nothing when already at or above that level
        action = 300 - inventory if inventory < 300 else 0
        state, reward, done, info = foodtruck.step(action)
        ep_reward += reward
        if done:
            break
    rewards.append(ep_reward)
np.mean(rewards)

2590.83

In [14]:
# Expected one-day profit when the day starts with `inv` units, each bought at `ucost`
ucost = 4
uprice = 7
v_demand = [100, 200, 300, 400]
p_demand = [0.3, 0.4, 0.2, 0.1]
inv = 400
# Sales are capped by the stock on hand, so weight min(demand, inv) by each demand's probability
expected_sales = np.sum([p * min(v, inv) for p, v in zip(p_demand, v_demand)])
profit = uprice * expected_sales - inv * ucost
print(profit)

-130.0


In [5]:
# v_demand = [100, 200, 300, 400]
# p_demand = [0.3, 0.4, 0.2, 0.1]
# capacity = v_demand[-1]
# days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', "Weekend"]
# unit_cost = 4
# net_revenue = 7
# action_space = [0, 100, 200, 300, 400]
# state_space = [("Mon", 0)] + [(d, i) for d in days[1:] for i in [0, 100, 200, 300]]
# state_space

[('Mon', 0),
 ('Tue', 0),
 ('Tue', 100),
 ('Tue', 200),
 ('Tue', 300),
 ('Wed', 0),
 ('Wed', 100),
 ('Wed', 200),
 ('Wed', 300),
 ('Thu', 0),
 ('Thu', 100),
 ('Thu', 200),
 ('Thu', 300),
 ('Fri', 0),
 ('Fri', 100),
 ('Fri', 200),
 ('Fri', 300),
 ('Weekend', 0),
 ('Weekend', 100),
 ('Weekend', 200),
 ('Weekend', 300)]

In [9]:
# def get_next_state_reward(state, action, demand):
#     day, inventory = state
#     result = {}
#     result['next_day'] = days[days.index(day) + 1]
#     result['starting_inventory'] = min(capacity, inventory + action)
#     result['cost'] = unit_cost * action
#     result['sales'] = min(result['starting_inventory'], demand)
#     result['revenue'] = net_revenue * result['sales']
#     result['next_inventory'] = result['starting_inventory'] - result['sales']
#     result['reward'] = result['revenue'] - result['cost']
#     return result
    
# next_s_r_prob = {}
# for ix, demand in enumerate(v_demand):
#     result = get_next_state_reward(state_space[1], action_space[-1], demand)
#     next_s = (result['next_day'], result['next_inventory'])
#     reward = result['reward']
#     prob = p_demand[ix]
#     if (next_s, reward) not in next_s_r_prob:
#         next_s_r_prob[next_s, reward] = prob
#     else:
#         next_s_r_prob[next_s, reward] += prob
# next_s_r_prob

{(('Wed', 300), -900): 0.3,
 (('Wed', 200), -200): 0.4,
 (('Wed', 100), 500): 0.2,
 (('Wed', 0), 1200): 0.1}

## Policy Evaluation

In [15]:
def base_policy(states):
    """Build a stochastic policy for every state in ``states``.

    With 300 or more units on hand the policy buys nothing; otherwise it
    orders up to 200 units or up to 300 units with equal probability.

    Args:
        states: iterable of ``(day, inventory)`` tuples.

    Returns:
        dict mapping each state to a dict of ``{action: probability}``.
    """
    policy = {}
    for s in states:
        day, inventory = s
        # Must be a dict keyed by action: assigning to an index of an
        # empty list raises IndexError.
        prob_a = {}
        if inventory >= 300:
            prob_a[0] = 1
        else:
            prob_a[200 - inventory] = 0.5
            prob_a[300 - inventory] = 0.5
        policy[s] = prob_a
    return policy