In [2]:
import gymnasium as gym
import torch

In [7]:
def run_episode(env: "gym.Env", policy):
    """Roll out one episode in ``env`` by following a tabular ``policy``.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: Tensor indexed by state; ``policy[state]`` holds the action
            to take in that state.

    Returns:
        A ``(states, rewards)`` pair of tensors. ``states`` contains the
        initial state followed by the state reached after every step, so it
        is one element longer than ``rewards``.
    """
    state, _ = env.reset()
    states = [state]
    rewards = []
    while True:
        # Cast to a plain int: the policy may be stored as a float tensor,
        # but the value is a discrete action index and should reach the
        # environment as an integer.
        action = int(policy[state].item())
        # The step tuple is ordered (..., terminated, truncated, info).
        state, reward, terminated, truncated, _ = env.step(action)
        states.append(state)
        rewards.append(reward)
        # Either flag ends the episode.
        if terminated or truncated:
            break
    return torch.tensor(states), torch.tensor(rewards)


In [8]:
def mc_prediction_first_visit(env, policy, gamma, n_episode):
    """Estimate state values of ``policy`` with first-visit Monte Carlo.

    For every sampled episode, each state that occurs contributes exactly one
    return: the discounted return that follows its earliest occurrence in the
    episode. The estimate for a state is the average of those returns over
    all episodes in which the state occurred.

    Args:
        env: Environment handed to ``run_episode``.
        policy: Tensor indexed by state; its first dimension gives the
            number of states.
        gamma: Discount factor applied to future rewards.
        n_episode: Number of episodes to sample.

    Returns:
        A 1-D tensor of value estimates, one entry per state. States that
        never occurred keep the value 0.
    """
    num_states = policy.shape[0]
    value_sum = torch.zeros(num_states)
    visit_count = torch.zeros(num_states)

    for _ in range(n_episode):
        trajectory, step_rewards = run_episode(env, policy)
        episode_return = torch.zeros(num_states)
        seen = torch.zeros(num_states, dtype=torch.bool)
        running = 0
        # Sweep the episode backwards, skipping the final state (no reward
        # follows it). Because earlier time steps are processed last, the
        # value left in episode_return[s] belongs to the first visit of s.
        for s, r in zip(trajectory.flip(0)[1:], step_rewards.flip(0)):
            running = gamma * running + r
            episode_return[s] = running
            seen[s] = True
        # Only states that occurred in this episode contribute.
        value_sum[seen] += episode_return[seen]
        visit_count[seen] += 1

    # Turn accumulated sums into averages where at least one sample exists.
    observed = visit_count > 0
    value_sum[observed] = value_sum[observed] / visit_count[observed]
    return value_sum



In [9]:
def mc_prediction_every_visit(env, policy, gamma, n_episode):
    """Estimate state values of ``policy`` with every-visit Monte Carlo.

    Every occurrence of a state in a sampled episode contributes the
    discounted return that follows that occurrence. The estimate for a state
    is the average of all such returns across all episodes.

    Args:
        env: Environment handed to ``run_episode``.
        policy: Tensor indexed by state; its first dimension gives the
            number of states.
        gamma: Discount factor applied to future rewards.
        n_episode: Number of episodes to sample.

    Returns:
        A 1-D tensor of value estimates, one entry per state. States that
        never occurred keep the value 0.
    """
    num_states = policy.shape[0]
    return_sum = torch.zeros(num_states)
    visit_count = torch.zeros(num_states)

    for _ in range(n_episode):
        trajectory, step_rewards = run_episode(env, policy)
        running = 0
        # Sweep backwards so the return can be built incrementally; the final
        # state is skipped because no reward follows it.
        for s, r in zip(trajectory.flip(0)[1:], step_rewards.flip(0)):
            running = gamma * running + r
            return_sum[s] += running
            visit_count[s] += 1

    # Average the accumulated returns where at least one sample exists.
    estimates = torch.zeros(num_states)
    observed = visit_count > 0
    estimates[observed] = return_sum[observed] / visit_count[observed]
    return estimates

In [10]:
# Evaluate a fixed policy on FrozenLake with first-visit Monte Carlo prediction.
env = gym.make('FrozenLake-v1')
gamma = 1  # undiscounted returns
n_episode = 10000  # number of sampled episodes to average over
# One action per state (16 entries). Stored as an integer tensor because the
# entries are discrete action indices rather than real-valued quantities.
optimal_policy = torch.tensor([0, 3, 3, 3, 0, 3, 2, 3, 3, 1, 0, 3, 3, 2, 1, 3])

value = mc_prediction_first_visit(env, optimal_policy, gamma, n_episode)
print('The value function calculated by first-visit MC prediction:\n', value)


The value function calculated by first-visit MC prediction:
 tensor([0.7384, 0.5007, 0.4965, 0.4389, 0.7384, 0.0000, 0.3893, 0.0000, 0.7384,
        0.7398, 0.6662, 0.0000, 0.0000, 0.8000, 0.8934, 0.0000])


In [11]:
# Re-estimate the same policy's values, this time averaging over every visit.
value = mc_prediction_every_visit(
    env=env,
    policy=optimal_policy,
    gamma=gamma,
    n_episode=n_episode,
)
print('The value function calculated by every-visit MC prediction:\n', value)


The value function calculated by every-visit MC prediction:
 tensor([0.6133, 0.4255, 0.3892, 0.3550, 0.6151, 0.0000, 0.3673, 0.0000, 0.6357,
        0.6737, 0.6367, 0.0000, 0.0000, 0.7640, 0.8759, 0.0000])
