In [560]:
import warnings ; warnings.filterwarnings('ignore')

import gymnasium as gym
import gym_walk
import gym_aima
import numpy as np
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

import itertools
from itertools import cycle
import tabulate

import random

np.set_printoptions(suppress=True)
np.random.seed(123)
random.seed(123)

In [561]:
for i in gym.envs.registry.keys():
    print(i, end='___')

CartPole-v0___CartPole-v1___MountainCar-v0___MountainCarContinuous-v0___Pendulum-v1___Acrobot-v1___phys2d/CartPole-v0___phys2d/CartPole-v1___phys2d/Pendulum-v0___LunarLander-v3___LunarLanderContinuous-v3___BipedalWalker-v3___BipedalWalkerHardcore-v3___CarRacing-v3___Blackjack-v1___FrozenLake-v1___FrozenLake8x8-v1___CliffWalking-v0___Taxi-v3___tabular/Blackjack-v0___tabular/CliffWalking-v0___Reacher-v2___Reacher-v4___Reacher-v5___Pusher-v2___Pusher-v4___Pusher-v5___InvertedPendulum-v2___InvertedPendulum-v4___InvertedPendulum-v5___InvertedDoublePendulum-v2___InvertedDoublePendulum-v4___InvertedDoublePendulum-v5___HalfCheetah-v2___HalfCheetah-v3___HalfCheetah-v4___HalfCheetah-v5___Hopper-v2___Hopper-v3___Hopper-v4___Hopper-v5___Swimmer-v2___Swimmer-v3___Swimmer-v4___Swimmer-v5___Walker2d-v2___Walker2d-v3___Walker2d-v4___Walker2d-v5___Ant-v2___Ant-v3___Ant-v4___Ant-v5___Humanoid-v2___Humanoid-v3___Humanoid-v4___Humanoid-v5___HumanoidStandup-v2___HumanoidStandup-v4___HumanoidStandup-v5___Gy

In [562]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='policy:'):
    print(title)
    arrs = {k:v for k, v in enumerate(action_symbols)}
    for s in range(len(P)):
        a = pi(s)
        print("| ", end='')
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=' ')
        else:
            print(str(s).zfill(2), arrs[a].rjust(6), end=' ')
        if (s + 1) % n_cols == 0: print('|')

In [563]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    print(title)
    for s in range(len(P)):
        v = V[s]
        print("| ", end='')
        if np.all([done for action in P[s].values() for _, _, _, done in action]):
            print("".rjust(9), end=' ')
        else:
            print(str(s).zfill(2), '{}'.format(np.round(v, prec)).rjust(6), end=' ')
        if (s + 1) % n_cols == 0: print('|')

In [564]:
def print_action_value_function(
    Q, optimal_Q=None, action_symbols=('<', '>'), prec=3, title='Action-value function:'):
    
    print(title)
    vf_types = ('',) if optimal_Q is None else ('', '*', 'err')
    headers = ['s',] + [" ".join(i) for i in list(itertools.product(vf_types, action_symbols))]
    states = np.arange(len(Q))[..., np.newaxis]
    arr = np.hstack((states, np.round(Q, prec)))
    if not (optimal_Q is None):
        arr = np.hstack((arr, np.round(optimal_Q, prec), np.round(optimal_Q-Q, prec)))
    print(tabulate(arr, headers, tablefmt='fancy_grid'))

In [565]:
def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200):
    results = []
    for _ in range(n_episodes):
        (state, info), done, steps = env.reset(), False, 0
        while not done and steps < max_steps:
            action = pi(state)
            state, reward, done, truncated, info = env.step(action)
            steps += 1
        results.append(state == goal_state)
    return int(np.sum(results) / len(results) * 100)

In [566]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    results = []
    for _ in range(n_episodes):
        (state, info), done, steps = env.reset(), False, 0
        results.append(0.0)
        while not done and steps < max_steps:
            action = pi(state)
            state, reward, done, truncated, info = env.step(action)
            results[-1] += reward
            steps += 1
    return np.mean(results)

# Slippery Walk Five MDP and sample policy

In [567]:
from gymnasium.envs.registration import register

register(
    id='SlipperyWalkFive-v0',
    entry_point='gym_walk.envs:WalkEnv',
    # left-most and right-most states are terminal
    kwargs={'n_states': 5, 'p_stay': 0.5*2/3., 'p_backward': 0.5*1/3.},
    max_episode_steps=100,
    reward_threshold=1.0,
    nondeterministic=True,
)

In [568]:
def print_policy_evaluation(env, pi, goal_state=6):
    p_success = probability_success(env, pi, goal_state)
    m_return = mean_return(env, pi)
    print(f'Reaches goal {p_success}%. Obtains an average undiscounted return of {m_return:.4f}.')

In [569]:
env = gym.make('SlipperyWalkFive-v0')

P = env.unwrapped.P
init_state = env.reset()
goal_state = 6

LEFT, RIGHT = range(2)

pi_left = {0:LEFT, 1:LEFT, 2:LEFT, 3:LEFT, 4:LEFT, 5:LEFT, 6:LEFT}
pi_right = {0:RIGHT, 1:RIGHT, 2:RIGHT, 3:RIGHT, 4:RIGHT, 5:RIGHT, 6:RIGHT}
pi = lambda s: pi_left[s]

print_policy(pi, P, action_symbols=('<', '>'), n_cols=7)
print_policy_evaluation(env, pi, P)

policy:
| 00      < | 01      < | 02      < | 03      < | 04      < | 05      < | 06      < |
Reaches goal 0%. Obtains an average undiscounted return of 0.0200.


# Policy Evaluation

In [570]:
def policy_evaluation(pi, P, gamma=1, theta=1e-10):
    prev_V = np.zeros(len(P), dtype=np.float64)
    step = 0
    while True:
        V = np.zeros(len(P), dtype=np.float64)
        for s in range(len(P)):
            for prob, next_state, reward, done in P[s][pi(s)]:
                # print(f"prob__{prob:f}   state__{s}   next_state__{next_state}   reward__{reward}   value__{ '' if V[s] == 0 else f'{V[s]:f}'}")
                V[s] += prob * (reward + gamma * prev_V[next_state] * (not done))
        if np.max(np.abs(prev_V - V)) < theta:
            break
        prev_V = V.copy()
        step +=1
    return V

In [571]:
V = policy_evaluation(pi, P)
print_state_value_function(V, P, n_cols=7, prec=5)

State-value function:
| 00 0.00046 | 01 0.00275 | 02 0.01099 | 03 0.03571 | 04 0.10989 | 05 0.33242 | 06 0.16621 |


# Policy Improvement

In [572]:
def policy_improvement(V, P, gamma=1.0):
    Q = np.zeros((len(P), len(P[0])), dtype=np.float64)
    for s in range(len(P)):
        for a in range(len(P[s])):
            for prop, next_state, reward, done in P[s][a]:
                Q[s][a] += prop * (reward + gamma * V[next_state] * (not done))
    new_pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return new_pi

In [573]:
improved_pi = policy_improvement(V, P)
print_policy(pi, P, action_symbols=('<', '>'), n_cols=7)
print_policy_evaluation(env, improved_pi, P)

policy:
| 00      < | 01      < | 02      < | 03      < | 04      < | 05      < | 06      < |
Reaches goal 0%. Obtains an average undiscounted return of 0.9300.


In [574]:
# how about we evaluate the improved policy?
improved_V = policy_evaluation(improved_pi, P)
print_state_value_function(improved_V, P, n_cols=7, prec=5)

State-value function:
| 00 0.33379 | 01 0.66758 | 02 0.89011 | 03 0.96429 | 04 0.98901 | 05 0.99725 | 06 0.49863 |


In [575]:
improved_improved_pi = policy_improvement(improved_V, P)
print_policy(pi, P, action_symbols=('<', '>'), n_cols=7)
print_policy_evaluation(env, improved_improved_pi, P)

policy:
| 00      < | 01      < | 02      < | 03      < | 04      < | 05      < | 06      < |
Reaches goal 0%. Obtains an average undiscounted return of 0.9700.


In [576]:
# it is the same policy
# if we evaluate again, we can see there is nothing to improve 
# that also means we reached the optimal policy
improved_improved_V = policy_evaluation(improved_improved_pi, P)
print_state_value_function(improved_improved_V, P, n_cols=7, prec=5)

State-value function:
| 00 0.33379 | 01 0.66758 | 02 0.89011 | 03 0.96429 | 04 0.98901 | 05 0.99725 | 06 0.49863 |


In [577]:
# state-value function didn't improve, then we reach the optimal policy
assert np.all(improved_V == improved_improved_V)

# Policy Iteration

In [578]:
actions = tuple(P[0].keys())
np.random.choice(actions, len(P))

array([0, 1, 0, 0, 0, 0, 0])

In [579]:
def policy_iteration(P, gamma=1.0, theta=1e-10):
    actions = tuple(P[0].keys())
    random_actions = np.random.choice(actions, len(P))
    pi = lambda s: {s: a for s, a in enumerate(random_actions)}[s]
    
    while True:
        old_pi = {s:pi(s) for s in range(len(P))}
        V = policy_evaluation(pi, P, gamma, theta)
        pi = policy_improvement(V, P, gamma)
        new_pi = {s:pi(s) for s in range(len(P))}
        # print_policy_evaluate(env, improved_pi, P)
        if np.all(old_pi == new_pi):
            break
    return V, pi

In [580]:
optimal_V, optimal_pi = policy_iteration(P)
print_policy_evaluation(env, optimal_pi, P)
print_state_value_function(optimal_V, P, n_cols=7, prec=5)

Reaches goal 0%. Obtains an average undiscounted return of 0.9700.
State-value function:
| 00 0.33379 | 01 0.66758 | 02 0.89011 | 03 0.96429 | 04 0.98901 | 05 0.99725 | 06 0.49863 |


In [581]:
assert np.all(improved_V == optimal_V)

# Frozen Lake MDP and sample policies

In [582]:
env = gym.make('FrozenLake-v1')
P = env.unwrapped.P
init_state = env.reset()
goal_state = 15

LEFT, DOWN, RIGHT, UP = range(4)
random_pi = lambda s: {
    0:RIGHT, 1:LEFT, 2:DOWN, 3:UP,
    4:LEFT, 5:LEFT, 6:RIGHT, 7:LEFT,
    8:UP, 9:DOWN, 10:UP, 11:LEFT,
    12:LEFT, 13:RIGHT, 14:DOWN, 15:LEFT
}[s]

print_policy(random_pi, P)
print_policy_evaluation(env, random_pi, goal_state)

policy:
| 00      > | 01      < | 02      v | 03      ^ |
| 04      < |           | 06      > |           |
| 08      ^ | 09      v | 10      ^ |           |
|           | 13      > | 14      v |           |
Reaches goal 12%. Obtains an average undiscounted return of 0.0800.


In [584]:
careful_pi = lambda s: {
    0:LEFT, 1:UP, 2:UP, 3:UP,
    4:LEFT, 5:LEFT, 6:UP, 7:LEFT,
    8:UP, 9:DOWN, 10:LEFT, 11:LEFT,
    12:LEFT, 13:RIGHT, 14:RIGHT, 15:LEFT
}[s]

print_policy(careful_pi, P)
print_policy_evaluation(env, careful_pi, goal_state)

policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      ^ |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      > |           |
Reaches goal 53%. Obtains an average undiscounted return of 0.6600.


# Policy Evaluation

In [585]:
V = policy_evaluation(careful_pi, P, gamma=0.99)
print_state_value_function(V, P, prec=4)

State-value function:
| 00 0.4079 | 01 0.3754 | 02 0.3543 | 03 0.3438 |
| 04 0.4203 |           | 06 0.1169 |           |
| 08 0.4454 | 09  0.484 | 10 0.4328 |           |
|           | 13 0.5884 | 14 0.7107 |           |


# Policy Improvement

In [607]:
careful_plus_pi = policy_improvement(V, P, gamma=0.99)
print_policy(careful_pi, P)
print_policy_evaluation(env=env, pi=careful_pi, goal_state=goal_state)

policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      ^ |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      > |           |
Reaches goal 60%. Obtains an average undiscounted return of 0.5100.


In [608]:
new_V = policy_evaluation(careful_plus_pi, P, gamma=0.99)
print_state_value_function(new_V, P, prec=4)

State-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 |
| 04 0.5585 |           | 06 0.3583 |           |
| 08 0.5918 | 09 0.6431 | 10 0.6152 |           |
|           | 13 0.7417 | 14 0.8628 |           |


In [609]:
print_state_value_function(new_V - V, P, prec=4)

State-value function:
| 00 0.1341 | 01 0.1234 | 02 0.1164 | 03  0.113 |
| 04 0.1381 |           | 06 0.2414 |           |
| 08 0.1464 | 09 0.1591 | 10 0.1824 |           |
|           | 13 0.1533 | 14 0.1521 |           |


# Alternating between evaluation and improvement

In [611]:
adversarial_pi = lambda s: {
    0:UP, 1:UP, 2:UP, 3:UP,
    4:UP, 5:LEFT, 6:UP, 7:LEFT,
    8:LEFT, 9:LEFT, 10:LEFT, 11:LEFT,
    12:LEFT, 13:LEFT, 14:LEFT, 15:LEFT
}[s]
print_policy(adversarial_pi, P)
print_policy_evaluation(env=env, pi=adversarial_pi, goal_state=goal_state)

policy:
| 00      ^ | 01      ^ | 02      ^ | 03      ^ |
| 04      ^ |           | 06      ^ |           |
| 08      < | 09      < | 10      < |           |
|           | 13      < | 14      < |           |
Reaches goal 0%. Obtains an average undiscounted return of 0.0000.


In [612]:
V = policy_evaluation(adversarial_pi, P, gamma=0.99)
print_state_value_function(V, P, prec=2)

State-value function:
| 00    0.0 | 01    0.0 | 02    0.0 | 03    0.0 |
| 04    0.0 |           | 06    0.0 |           |
| 08    0.0 | 09    0.0 | 10    0.0 |           |
|           | 13    0.0 | 14    0.0 |           |


In [615]:
i_pi = policy_improvement(V, P, gamma=0.99)
print_policy(i_pi, P)
print_policy_evaluation(env=env, pi=i_pi, goal_state=goal_state)

policy:
| 00      < | 01      < | 02      < | 03      < |
| 04      < |           | 06      < |           |
| 08      < | 09      < | 10      < |           |
|           | 13      < | 14      v |           |
Reaches goal 0%. Obtains an average undiscounted return of 0.0000.


In [616]:
i_V = policy_evaluation(i_pi, P, gamma=0.99)
print_state_value_function(i_V, P, prec=2)

State-value function:
| 00    0.0 | 01    0.0 | 02   0.04 | 03   0.02 |
| 04    0.0 |           | 06   0.07 |           |
| 08    0.0 | 09    0.0 | 10   0.19 |           |
|           | 13    0.0 | 14    0.5 |           |


In [617]:
ii_pi = policy_improvement(i_V, P, gamma=0.99)
print_policy(ii_pi, P)
print_policy_evaluation(env=env, pi=ii_pi, goal_state=goal_state)

policy:
| 00      < | 01      v | 02      > | 03      ^ |
| 04      < |           | 06      < |           |
| 08      < | 09      v | 10      < |           |
|           | 13      v | 14      > |           |
Reaches goal 0%. Obtains an average undiscounted return of 0.0000.


In [619]:
ii_V = policy_evaluation(ii_pi, P, gamma=0.99)
print_state_value_function(ii_V, P, prec=2)

State-value function:
| 00    0.0 | 01   0.05 | 02   0.16 | 03   0.15 |
| 04    0.0 |           | 06   0.17 |           |
| 08    0.0 | 09   0.22 | 10   0.35 |           |
|           | 13   0.33 | 14   0.67 |           |


In [622]:
iii_pi = policy_improvement(ii_V, P, gamma=0.99)
print_policy(iii_pi, P)
print_policy_evaluation(env=env, pi=iii_pi, goal_state=goal_state)

policy:
| 00      v | 01      > | 02      > | 03      ^ |
| 04      < |           | 06      < |           |
| 08      v | 09      v | 10      < |           |
|           | 13      > | 14      > |           |
Reaches goal 16%. Obtains an average undiscounted return of 0.1300.


In [623]:
iii_V = policy_evaluation(iii_pi, P, gamma=0.99)
print_state_value_function(iii_V, P, prec=2)

State-value function:
| 00   0.12 | 01   0.09 | 02   0.19 | 03   0.19 |
| 04   0.15 |           | 06    0.2 |           |
| 08   0.19 | 09   0.38 | 10   0.43 |           |
|           | 13   0.53 | 14   0.71 |           |


In [624]:
iiii_pi = policy_improvement(iii_V, P, gamma=0.99)
print_policy(iiii_pi, P)
print_policy_evaluation(env=env, pi=iiii_pi, goal_state=goal_state)

policy:
| 00      < | 01      ^ | 02      > | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 75%. Obtains an average undiscounted return of 0.6900.


In [625]:
iiii_V = policy_evaluation(iiii_pi, P, gamma=0.99)
print_state_value_function(iiii_V, P, prec=2)

State-value function:
| 00   0.52 | 01   0.38 | 02   0.26 | 03   0.25 |
| 04   0.54 |           | 06   0.28 |           |
| 08   0.57 | 09   0.62 | 10   0.58 |           |
|           | 13   0.72 | 14   0.85 |           |


In [626]:
iiiii_pi = policy_improvement(iiii_V, P, gamma=0.99)
print_policy(iiiii_pi, P)
print_policy_evaluation(env=env, pi=iiiii_pi, goal_state=goal_state)

policy:
| 00      < | 01      ^ | 02      < | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 75%. Obtains an average undiscounted return of 0.7500.


In [627]:
iiiii_V = policy_evaluation(iiii_pi, P, gamma=0.99)
print_state_value_function(iiii_V, P, prec=2)

State-value function:
| 00   0.52 | 01   0.38 | 02   0.26 | 03   0.25 |
| 04   0.54 |           | 06   0.28 |           |
| 08   0.57 | 09   0.62 | 10   0.58 |           |
|           | 13   0.72 | 14   0.85 |           |


In [628]:
iiiiii_pi = policy_improvement(iiii_V, P, gamma=0.99)
print_policy(iiiiii_pi, P)
print_policy_evaluation(env=env, pi=iiiiii_pi, goal_state=goal_state)

policy:
| 00      < | 01      ^ | 02      < | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 73%. Obtains an average undiscounted return of 0.8200.


In [630]:
iiiiii_V = policy_evaluation(iiiiii_pi, P, gamma=0.99)
print_state_value_function(iiiiii_V, P, prec=2)

State-value function:
| 00   0.53 | 01   0.45 | 02   0.38 | 03   0.37 |
| 04   0.55 |           | 06   0.32 |           |
| 08   0.58 | 09   0.63 | 10    0.6 |           |
|           | 13   0.73 | 14   0.86 |           |


In [631]:
iiiiiii_pi = policy_improvement(iiiiii_V, P, gamma=0.99)
print_policy(iiiiiii_pi, P)
print_policy_evaluation(env=env, pi=iiiiiii_pi, goal_state=goal_state)

policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 76%. Obtains an average undiscounted return of 0.8800.


# Policy Iteration

In [632]:
V_best_p, pi_best_p = policy_iteration(P, gamma=0.99)
print_state_value_function(V_best_p, P, prec=4)
print()
print('Optimal policy and state-value function (PI):')
print_policy(pi_best_p, P)
print_policy_evaluation(env=env, pi=pi_best_p, goal_state=goal_state)

State-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 |
| 04 0.5585 |           | 06 0.3583 |           |
| 08 0.5918 | 09 0.6431 | 10 0.6152 |           |
|           | 13 0.7417 | 14 0.8628 |           |

Optimal policy and state-value function (PI):
policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 81%. Obtains an average undiscounted return of 0.7800.


# Slippery Walk Five

In [659]:
env = gym.make('SlipperyWalkFive-v0')
init_state = env.reset()
goal_state = 6
P = env.unwrapped.P

# Value Iteration

In [660]:
def value_iteration(P, gamma=1.0, theta=1e-10):
    V = np.zeros(len(P), dtype=np.float64())
    while True:
        Q = np.zeros((len(P), len(P[0])), dtype=np.float64)
        for s in range(len(P)):
            for a in range(len(P[s])):
                for prop, next_state, reward, done in P[s][a]:
                    next_state_value = gamma * V[next_state] * (not done)
                    Q[s, a] += prop * (reward + next_state_value)
        
        new_V = np.max(Q, axis=1)
        if np.max(np.abs(V - new_V)) < theta:
            break
        V = new_V
    
    pi = lambda s: {s:a for s, a in enumerate(np.argmax(Q, axis=1))}[s]
    return V, pi

In [661]:
optimal_V, optimal_pi = value_iteration(P)
print('Optimal policy and state-value function (PI):')
print_policy(optimal_pi, P, action_symbols=('<', '>'), n_cols=7)
print_policy_evaluation(env, optimal_pi, goal_state)
print()
print_state_value_function(optimal_V, P, n_cols=7, prec=5)

Optimal policy and state-value function (PI):
policy:
| 00      > | 01      > | 02      > | 03      > | 04      > | 05      > | 06      < |
Reaches goal 97%. Obtains an average undiscounted return of 0.9800.

State-value function:
| 00 0.33379 | 01 0.66758 | 02 0.89011 | 03 0.96429 | 04 0.98901 | 05 0.99725 | 06 0.49863 |


# Frozen Lake MDP

In [655]:
env = gym.make('FrozenLake-v1')
init_state = env.reset()
goal_state = 15
P = env.unwrapped.P

In [656]:
V_best_v, pi_best_v = value_iteration(P, gamma=0.99)
print('Optimal policy and state-value function (VI):')
print_policy(pi_best_v, P)
print_policy_evaluation(env, pi_best_v, goal_state)
print()
print_state_value_function(V_best_v, P, prec=4)

Optimal policy and state-value function (VI):
policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 82%. Obtains an average undiscounted return of 0.8000.

State-value function:
| 00  0.542 | 01 0.4988 | 02 0.4707 | 03 0.4569 |
| 04 0.5585 |           | 06 0.3583 |           |
| 08 0.5918 | 09 0.6431 | 10 0.6152 |           |
|           | 13 0.7417 | 14 0.8628 |           |


In [657]:
print('For comparison, optimal policy and state-value function (PI):')
print_policy(pi_best_p, P)
print_policy_evaluation(env, pi_best_p, goal_state)
print()
print_state_value_function(V_best_p, P)

For comparison, optimal policy and state-value function (PI):
policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 78%. Obtains an average undiscounted return of 0.7500.

State-value function:
| 00  0.542 | 01  0.499 | 02  0.471 | 03  0.457 |
| 04  0.558 |           | 06  0.358 |           |
| 08  0.592 | 09  0.643 | 10  0.615 |           |
|           | 13  0.742 | 14  0.863 |           |


# Changing the Frozen Lake environment MDP

In [662]:
env = gym.make('FrozenLake-v1')
P = env.unwrapped.P
P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

In [663]:
env = gym.make('FrozenLake-v1')
P = env.unwrapped.P

# change reward function
reward_goal, reward_holes, reward_others = 1, -1, -0.01
goal, hole = 15, [5, 7, 11, 12]
for s in range(len(P)):
    for a in range(len(P[s])):
        for t in range(len(P[s][a])):
            values = list(P[s][a][t])
            if values[1] == goal:
                values[2] = reward_goal
                values[3] = False
            elif values[1] in hole:
                values[2] = reward_holes
                values[3] = False
            else:
                values[2] = reward_others
                values[3] = False
            if s in hole or s == goal:
                values[2] = 0
                values[3] = True
            P[s][a][t] = tuple(values)

# change transition function
prob_action, prob_drift_one, prob_drift_two = 0.8, 0.1, 0.1
for s in range(len(P)):
    for a in range(len(P[s])):
        for t in range(len(P[s][a])):
            if P[s][a][t][0] == 1.0:
                continue
            values = list(P[s][a][t])
            if t == 0:
                values[0] = prob_drift_one
            elif t == 1:
                values[0] = prob_action
            elif t == 2:
                values[0] = prob_drift_two
            P[s][a][t] = tuple(values)

env.env.P = P

In [664]:
V_best, pi_best = policy_iteration(env.env.P, gamma=0.99)
print('Optimal policy and state-value function (PI):')
print_policy(pi_best, P)
print_policy_evaluation(env, pi_best, goal_state)
print()
print_state_value_function(V_best, P)

Optimal policy and state-value function (PI):
policy:
| 00      v | 01      ^ | 02      v | 03      ^ |
| 04      < |           | 06      v |           |
| 08      > | 09      v | 10      < |           |
|           | 13      > | 14      > |           |
Reaches goal 0%. Obtains an average undiscounted return of 0.5293.

State-value function:
| 00  0.433 | 01  0.353 | 02  0.409 | 03   0.28 |
| 04  0.461 |           | 06   0.45 |           |
| 08  0.636 | 09  0.884 | 10  0.831 |           |
|           | 13  0.945 | 14  0.977 |           |


In [665]:
V_best, pi_best = value_iteration(env.env.P, gamma=0.99)
print('Optimal policy and state-value function (PI):')
print_policy(pi_best, P)
print_policy_evaluation(env, pi_best, goal_state)
print()
print_state_value_function(V_best, P)

Optimal policy and state-value function (PI):
policy:
| 00      v | 01      ^ | 02      v | 03      ^ |
| 04      < |           | 06      v |           |
| 08      > | 09      v | 10      < |           |
|           | 13      > | 14      > |           |
Reaches goal 0%. Obtains an average undiscounted return of 0.5719.

State-value function:
| 00  0.433 | 01  0.353 | 02  0.409 | 03   0.28 |
| 04  0.461 |           | 06   0.45 |           |
| 08  0.636 | 09  0.884 | 10  0.831 |           |
|           | 13  0.945 | 14  0.977 |           |


# Russell & Norvig's Gridworld

In [668]:
from gymnasium.envs.registration import register

register(
    id='RussellNorvigGridworld-v0',
    # To reproduce must use gamma of 1.0 (no discount)
    entry_point='gym_aima.envs:AIMAEnv',
    # With or without the sink state it will work
    # because they don't use discounting
    kwargs={'noise': 0.2, 'living_rew': -0.04, 'sink': False},
    max_episode_steps=100,
    reward_threshold=1.0,
    nondeterministic=True,
)

In [667]:
env = gym.make('RussellNorvigGridworld-v0')
init_state = env.reset()
goal_state = 3
P = env.unwrapped.P

ImportError: cannot import name 'discrete' from 'gym.envs.toy_text' (/home/bakr/miniconda3/envs/torch/lib/python3.11/site-packages/gym/envs/toy_text/__init__.py)