In [1]:
import numpy as np
import gym

In [2]:
# As every action from each state is chosen using an epsilon-greedy
# policy, there is no need to store an explicit policy. Instead this
# function can be called whenever an acion needs to be selected.
def epsilon_greedy_policy(Q, state, epsilon):
    q_values = Q[state]
    max_q = np.max(q_values)
    rand = np.random.rand()
    if rand <= epsilon:
        return np.random.randint(len(q_values)) 
    else:
        # Arbitarily chooses an action from tied (if any) max action-values
        return np.random.choice(np.argwhere(q_values == max_q).flatten())

In [3]:
def on_policy_monte_carlo(env, Q, N, episodes, epsilon, gamma, render=False):
    for episode in range(episodes):
        state = env.reset()
        sample = []
        while True:
            if render:
                env.render()
            action = epsilon_greedy_policy(Q, state, epsilon)
            state, reward, done, _ = env.step(action)
            sample = np.append(sample, (action, reward, state))
            if done:
                sample = sample[:-1] # Removing redundant last state
                G = 0 # G = Return
                # Looping through sample backwards to incrementally adjust return 
                # and avoid having to calculate the same parts multiple times for 
                # all state-action pairs.
                for i in range(len(sample)-1, 0, -3):
                    S, A, R = int(sample[i-2]), int(sample[i-1]), sample[i]
                    G = R + gamma * G
                    N[S, A] = N[S, A] + 1 # N = count each state-action pair has been visited
                    Q[S, A] = Q[S, A] + (1 / N[S, A]) * (G - Q[S, A])
                break
    env.close()
    return Q

In [4]:
def success_rate(env, Q, runs):
    goals_reached = 0
    for episode in range(runs):
        state = env.reset()
        while True:
            action = np.argmax(Q[state])
            state, reward, done, _ = env.step(action)
            if done:
                goals_reached += reward
                break
    env.close()
    
    print("{} goals reached in {} runs.".format(goals_reached, runs))
    print("Accurracy: {}%".format((goals_reached / runs) * 100 ))

In [6]:
env = gym.make("FrozenLake-v0")
env.observation_space, env.action_space

(Discrete(16), Discrete(4))

In [7]:
# All action-values will be stored in Q (16 states all with 4 possible actions)
Q = np.zeros((env.observation_space.n, env.action_space.n))
# Count for how many times each state-action pair has been visited
N = np.zeros((env.observation_space.n, env.action_space.n))
epsilon = 0.1
gamma = 0.7

In [8]:
Q = on_policy_monte_carlo(env, Q, N, 100000, epsilon, gamma)
print("Action-values:\n", Q)
success_rate(env, Q, 100)

Action-values:
 [[0.00305897 0.00260874 0.00242546 0.00165169]
 [0.00517387 0.00516486 0.00514099 0.00443338]
 [0.00770452 0.00877772 0.00868061 0.00175448]
 [0.00318169 0.00326304 0.0014384  0.00378942]
 [0.00801093 0.00663437 0.00615736 0.00269845]
 [0.         0.         0.         0.        ]
 [0.03075269 0.03097753 0.03319792 0.00315831]
 [0.         0.         0.         0.        ]
 [0.00728788 0.02189034 0.01751979 0.02379878]
 [0.04172358 0.07296763 0.06747161 0.03773244]
 [0.13495538 0.11610201 0.11163687 0.02562988]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.05258514 0.14948462 0.16324656 0.12893918]
 [0.17618077 0.48133033 0.47019283 0.39796134]
 [0.         0.         0.         0.        ]]
70.0 goals reached in 100 runs.
Accurracy: 70.0%
