In [1]:
import numpy as np
from food_truck_env import FoodTruck

In [2]:
def choose_action(state, policy):
    """
    Samples one action for the given state

    policy maps each state to a dict of {action: probability}. The action is
    drawn with np.random.choice using those probabilities as weights.
    """
    distribution = policy[state]
    candidates = list(distribution)
    weights = list(distribution.values())

    return np.random.choice(a=candidates, p=weights)

In [3]:
def first_visit_return(returns, trajectory, gamma):
    """
    Adds the first-visit returns of one trajectory to the collected returns

    returns is a dict that has states as keys and lists of observed returns (G) as values.
    It is updated in place and also returned.
    trajectory is a sample trajectory including state, action, reward in each element
    gamma is the discount factor applied to the return of the following step

    Starts from last element of trajectory and calculates G for each step. G is appended
    to the returns of a state only at the step where that state occurs for the first time
    in the trajectory.
    """
    # Index of the first occurrence of every state, built in a single pass so the
    # first-visit test below is a dict lookup instead of a rescan of the trajectory.
    first_index = {}
    for idx, step in enumerate(trajectory):
        first_index.setdefault(step[0], idx)

    G = 0
    for idx in range(len(trajectory) - 1, -1, -1):
        s, a, r = trajectory[idx]
        G = r + gamma * G

        # Only the earliest occurrence of a state contributes a return
        if first_index[s] == idx:
            returns.setdefault(s, []).append(G)

    return returns

In [4]:
def get_trajectory(env, policy):
    """
    Plays one episode in env while following policy

    The episode starts from env.reset() and ends when env.step reports done.
    Returns a list with one [state, action, reward] entry per step, where state
    is the state the action was chosen in.
    """
    episode = []
    current = env.reset()
    finished = False

    while not finished:
        chosen = choose_action(current, policy)
        following, payoff, finished, _ = env.step(chosen)

        # Record the state the decision was made in, not the state it led to
        episode.append([current, chosen, payoff])
        current = following

    return episode

In [5]:
def first_visit_mc(env, policy, gamma, n_trajectories):
    """
    Estimates the state-value function of policy with first-visit Monte Carlo

    Samples n_trajectories episodes, collects the first-visit returns of every
    state and averages them. Returns a dict of {state: mean return rounded to
    one decimal}; states of env.state_space that were never visited are left out.
    """
    observed = {}
    for _ in range(n_trajectories):
        episode = get_trajectory(env, policy)
        observed = first_visit_return(observed, episode, gamma)

    return {
        state: np.round(np.mean(observed[state]), 1)
        for state in env.state_space
        if state in observed
    }

The policy defined below is the same one we used in the dynamic programming notebook.

In [6]:
def some_policy(states):
    """
    Builds a fixed stochastic policy over the given states

    Every state is a (day, inventory) pair. With an inventory of 300 or more
    the only action is 0. Otherwise the actions 200 - inventory and
    300 - inventory get probability 0.5 each.
    Returns a dict of {state: {action: probability}}.
    """
    def action_distribution(stock):
        if stock >= 300:
            return {0: 1}
        return {200 - stock: 0.5, 300 - stock: 0.5}

    table = {}
    for state in states:
        _, stock = state
        table[state] = action_distribution(stock)

    return table

In [7]:
# Create the environment and build the fixed policy over all of its states.
# Both names are reused by the cells below.
env = FoodTruck()
policy = some_policy(env.state_space)

In [8]:
# Evaluate the fixed policy without discounting, averaging over 1000 sampled episodes
v_estimate = first_visit_mc(env, policy, gamma=1, n_trajectories=1000)
print("Expected weekly profit for some policy is: ", v_estimate[("Mon", 0)])

Expected weekly profit for some policy is:  2478.1


We again see that MC prediction correctly estimated the expected weekly profit under the given policy.

In [9]:
def get_eps_greedy(actions, eps, a_best):
    """
    Builds an epsilon-greedy distribution over actions

    Every action gets eps / len(actions); a_best additionally gets 1 - eps.
    Example: with 4 actions and eps=0.4 the best action ends up with 0.7 and
    each of the other actions with 0.1.
    Returns a dict of {action: probability}.
    """
    def weight(candidate):
        exploration = eps / len(actions)
        if candidate == a_best:
            return 1 - eps + exploration
        return exploration

    return {candidate: weight(candidate) for candidate in actions}

In [10]:
def get_random_policy(states, actions):
    """
    Builds a uniform random policy

    Every state gets its own dict in which each action has probability
    1 / len(actions).
    """
    return {
        state: {action: 1 / len(actions) for action in actions}
        for state in states
    }

In [11]:
def on_policy_first_visit_mc(env, n_iter, eps, gamma):
    """
    On-policy first-visit Monte Carlo control with an epsilon-greedy policy

    Starts from a uniform random policy over env.state_space and env.action_space.
    In each of the n_iter iterations one trajectory is sampled with the current policy.
    For the first occurrence of every (state, action) pair in that trajectory the
    return G (discounted with gamma) is averaged into Q, and the policy of that state
    is made eps-greedy with respect to the updated Q.

    Returns (policy, Q, N) where
    policy is a dict of {state: {action: probability}}
    Q is a dict of {state: {action: mean first-visit return}}
    N is a dict of {state: {action: number of returns averaged into Q}}
    """
    states = env.state_space
    actions = env.action_space
    policy = get_random_policy(states, actions)

    Q = {s: {a: 0 for a in actions} for s in states}
    N = {s: {a: 0 for a in actions} for s in states}

    for i in range(n_iter):
        if i % 10000 == 0:
            print(f"Iteration: {i}")

        trajectory = get_trajectory(env, policy)

        # Index of the first occurrence of every (state, action) pair, built in one
        # pass so the first-visit test is a dict lookup instead of a rescan.
        first_seen = {}
        for idx, (s_j, a_j, _) in enumerate(trajectory):
            first_seen.setdefault((s_j, a_j), idx)

        G = 0
        # Walk backwards so G accumulates the discounted rewards that follow each step
        for idx in range(len(trajectory) - 1, -1, -1):
            s, a, r = trajectory[idx]
            G = r + gamma * G

            if first_seen[(s, a)] != idx:
                continue

            # Running mean of the returns observed for (s, a)
            Q[s][a] = N[s][a] * Q[s][a] + G
            N[s][a] += 1
            Q[s][a] /= N[s][a]

            # Policy improvement: eps-greedy around the currently best action of s
            a_best = max(Q[s], key=Q[s].get)
            policy[s] = get_eps_greedy(actions, eps, a_best)

    return policy, Q, N

In [12]:
# Learn an eps-greedy policy with on-policy MC control (undiscounted), then display it
policy, Q, Q_n = on_policy_first_visit_mc(env, n_iter=300000, eps=0.05, gamma=1)
policy

Iteration: 0
Iteration: 10000
Iteration: 20000
Iteration: 30000
Iteration: 40000
Iteration: 50000
Iteration: 60000
Iteration: 70000
Iteration: 80000
Iteration: 90000
Iteration: 100000
Iteration: 110000
Iteration: 120000
Iteration: 130000
Iteration: 140000
Iteration: 150000
Iteration: 160000
Iteration: 170000
Iteration: 180000
Iteration: 190000
Iteration: 200000
Iteration: 210000
Iteration: 220000
Iteration: 230000
Iteration: 240000
Iteration: 250000
Iteration: 260000
Iteration: 270000
Iteration: 280000
Iteration: 290000


{('Mon', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},
 ('Tue', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},
 ('Tue', 100): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Tue', 200): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Tue', 300): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},
 ('Wed', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.96},
 ('Wed', 100): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Wed', 200): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Wed', 300): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},
 ('Thu', 0): {0: 0.01, 100: 0.01, 200: 0.01, 300: 0.96, 400: 0.01},
 ('Thu', 100): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Thu', 200): {0: 0.01, 100: 0.96, 200: 0.01, 300: 0.01, 400: 0.01},
 ('Thu', 300): {0: 0.96, 100: 0.01, 200: 0.01, 300: 0.01, 400: 0.01},
 ('Fri', 0): {0: 0.01, 100: 0.01, 200: 0.96, 300: 0.01, 400: 0.01},
 ('Fri', 100): {0: 0.01, 100: 

Again, we found the optimal policy with on-policy MC control.