In [3]:
import gymnasium as gym
import torch

In [4]:
# Create the FrozenLake environment; the "ansi" render mode is requested so
# that render() output can be printed as text in the notebook.
env = gym.make("FrozenLake-v1", render_mode="ansi")

# Number of states and actions, read from the `n` attribute of each space.
n_state = env.observation_space.n
n_action = env.action_space.n


In [6]:
# Reset to the start of an episode (the info dict is discarded) and print the
# rendered grid.
state, _ = env.reset()
print(env.render())


[41mS[0mFFF
FHFH
FFFH
HFFG



In [13]:
# Discount factor applied to successor-state values in the Bellman backups.
gamma = 0.99
# Convergence tolerance handed to value_iteration.
threshold = 0.0001

In [12]:

# One Bellman-backup sweep over all states, starting from all-zero values.
# This is the first iteration of value iteration written out by hand so the
# per-action values can be inspected.
V_res = torch.zeros(n_state)

for state in range(n_state):
    v_actions = torch.zeros(n_action)
    for action in range(n_action):
        # Expected one-step return of `action`: sum over every possible outcome.
        for trans_prob, new_state, reward, _ in env.unwrapped.P[state][action]:
            v_actions[action] += trans_prob * (reward + gamma * V_res[new_state])
    # Report the backup for this state; without this line the sweep computes
    # the values and silently throws them away.
    print(f'state={state} v_actions={v_actions} max={torch.max(v_actions)} ')

state=0 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=1 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=2 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=3 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=4 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=5 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=6 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=7 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=8 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=9 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=10 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=11 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=12 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=13 v_actions=tensor([0., 0., 0., 0.]) max=0.0 
state=14 v_actions=tensor([0.0000, 0.3333, 0.3333, 0.3333]) max=0.3333333432674408 
state=15 v_actions=tensor([0., 0., 0., 0.]) max=0.0 


In [14]:
def value_iteration(env: gym.Env, gamma, threshold):
    """
    Compute optimal state values with the value iteration algorithm
    @param env: environment exposing observation_space.n, action_space.n and
                the transition table env.unwrapped.P[state][action]
    @param gamma: discount factor
    @param threshold: sweeps stop once the largest change of any state value
                      between two consecutive sweeps is at most this number
    @return: tensor with one optimal value per state
    """
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    transitions = env.unwrapped.P

    values = torch.zeros(num_states)
    converged = False
    while not converged:
        updated = torch.empty(num_states)
        for s in range(num_states):
            q_values = torch.zeros(num_actions)
            for a in range(num_actions):
                # Accumulate the expected return over every outcome of (s, a).
                for prob, next_s, reward, _ in transitions[s][a]:
                    q_values[a] += prob * (reward + gamma * values[next_s])
            # The best action's value is the new value estimate of the state.
            updated[s] = q_values.max()
        # Convergence check: largest absolute change produced by this sweep.
        converged = (values - updated).abs().max() <= threshold
        values = updated
    return values


In [15]:
# Run value iteration with the discount factor and tolerance defined earlier
# and print the resulting state values.
V_optimal = value_iteration(env, gamma, threshold)
print('Optimal values:\n{}'.format(V_optimal))


Optimal values:
tensor([0.5404, 0.4966, 0.4681, 0.4541, 0.5569, 0.0000, 0.3572, 0.0000, 0.5905,
        0.6421, 0.6144, 0.0000, 0.0000, 0.7410, 0.8625, 0.0000])


In [17]:
def extract_optimal_policy(env: gym.Env, V_optimal, gamma):
    """
    Derive the greedy policy from a table of state values
    @param env: environment exposing observation_space.n, action_space.n and
                the transition table env.unwrapped.P[state][action]
    @param V_optimal: state values, indexable by state
    @param gamma: discount factor
    @return: float tensor holding, for each state, the index of the action
             with the highest expected one-step return
    """
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    transitions = env.unwrapped.P

    policy = torch.zeros(num_states)
    for s in range(num_states):
        q_values = torch.zeros(num_actions)
        for a in range(num_actions):
            # Expected return of taking `a` in `s`, summed over all outcomes.
            for prob, next_s, reward, _ in transitions[s][a]:
                q_values[a] += prob * (reward + gamma * V_optimal[next_s])
        policy[s] = q_values.argmax()
    return policy


In [18]:
# Turn the computed state values into a per-state action choice and print it.
optimal_policy = extract_optimal_policy(env, V_optimal, gamma)
print('Optimal policy:\n{}'.format(optimal_policy))


Optimal policy:
tensor([0., 3., 3., 3., 0., 0., 0., 0., 3., 1., 0., 0., 0., 2., 1., 0.])


In [19]:
def run_episode(env: gym.Env, policy):
    """
    Play one episode, always taking the action the policy prescribes
    @param env: environment following the Gymnasium reset()/step() API
    @param policy: per-state actions, indexable by state (e.g. a 1-D tensor)
    @return: total (undiscounted) reward collected during the episode
    """
    state, _ = env.reset()
    episode_reward = 0
    while True:
        # The policy may be stored as a float tensor; a discrete action space
        # expects a plain integer, so convert explicitly.
        action = int(policy[state])
        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        state, reward, terminated, truncated, _ = env.step(action)
        episode_reward += reward
        if terminated or truncated:
            break
    return episode_reward

In [20]:
n_episode = 1000
eval_seed = 0

# Seed the environment's random generator once so the stochastic transitions,
# and therefore the reported average, are reproducible. The reset() calls made
# inside run_episode pass no seed and so continue this same random stream.
env.reset(seed=eval_seed)

# Roll out the policy repeatedly and collect the total reward of each episode.
total_rewards = []
for episode in range(n_episode):
    total_reward = run_episode(env, optimal_policy)
    total_rewards.append(total_reward)

print('Average total reward under the optimal policy: {}'.format(sum(total_rewards) / n_episode))



Average total reward under the optimal policy: 0.76
