In [255]:
pip install git+https://github.com/mimoralea/gym-walk#egg=gym-walk

Collecting gym-walk
  Cloning https://github.com/mimoralea/gym-walk to /tmp/pip-install-asxspuyu/gym-walk_9daef8af646044498bc6ab60eb0c9d92
  Running command git clone --filter=blob:none --quiet https://github.com/mimoralea/gym-walk /tmp/pip-install-asxspuyu/gym-walk_9daef8af646044498bc6ab60eb0c9d92
  Resolved https://github.com/mimoralea/gym-walk to commit b915b94cf2ad16f8833a1ad92ea94e88159279f5
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [30]:
import warnings ; warnings.filterwarnings('ignore')

import gym, gym_walk
import numpy as np

import random
import warnings

warnings.filterwarnings('ignore', category=DeprecationWarning)
np.set_printoptions(suppress=True)
random.seed(123); np.random.seed(123)


In [31]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^'), n_cols=4, title='Policy:'):
    """Print a policy as a grid of state indices and action symbols.

    Args:
        pi: Callable mapping a state index to an action index.
        P: Transition model indexed as ``P[s][a]``, each entry a list of
            ``(prob, next_state, reward, done)`` tuples.
        action_symbols: Symbol displayed for each action index.
        n_cols: Number of states printed per row.
        title: Heading printed before the grid.

    States whose every transition is flagged ``done`` are printed as blank
    cells, and the policy is not queried for them.
    """
    print(title)
    symbols = dict(enumerate(action_symbols))
    n_states = len(P)
    for s in range(n_states):
        print("| ", end="")
        # A state is treated as terminal when all of its transitions end the episode.
        is_terminal = all(
            done for transitions in P[s].values() for _, _, _, done in transitions
        )
        if is_terminal:
            print("".rjust(9), end=" ")
        else:
            # Only ask the policy for an action when it is actually displayed.
            print(str(s).zfill(2), symbols[pi(s)].rjust(6), end=" ")
        if (s + 1) % n_cols == 0:
            print("|")
    # Close the last row when the state count is not a multiple of n_cols,
    # otherwise the grid ends mid-line without a border or newline.
    if n_states % n_cols != 0:
        print("|")

In [32]:
def print_state_value_function(V, P, n_cols=4, prec=3, title='State-value function:'):
    """Print state values as a grid of state indices and rounded values.

    Args:
        V: Sequence of state values indexed by state.
        P: Transition model indexed as ``P[s][a]``, each entry a list of
            ``(prob, next_state, reward, done)`` tuples.
        n_cols: Number of states printed per row.
        prec: Number of decimals the values are rounded to.
        title: Heading printed before the grid.

    States whose every transition is flagged ``done`` are printed as blank cells.
    """
    print(title)
    n_states = len(P)
    for s in range(n_states):
        print("| ", end="")
        # A state is treated as terminal when all of its transitions end the episode.
        is_terminal = all(
            done for transitions in P[s].values() for _, _, _, done in transitions
        )
        if is_terminal:
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), f"{np.round(V[s], prec)}".rjust(6), end=" ")
        if (s + 1) % n_cols == 0:
            print("|")
    # Close the last row when the state count is not a multiple of n_cols,
    # otherwise the grid ends mid-line without a border or newline.
    if n_states % n_cols != 0:
        print("|")

In [33]:
def probability_success(env, pi, goal_state, n_episodes=100, max_steps=200):
    """Estimate the fraction of episodes in which the policy ends in the goal state.

    Args:
        env: Environment exposing ``action_space.seed``, ``reset(seed=...)`` and ``step``.
        pi: Callable mapping a state to an action.
        goal_state: State that counts as a success when an episode ends there.
        n_episodes: Number of episodes to simulate.
        max_steps: Hard cap on steps per episode.

    Returns:
        ``successes / n_episodes`` as a float.
    """
    successes = 0
    # Seed action space for reproducibility if sampling from it
    env.action_space.seed(123)

    for episode in range(n_episodes):
        # Seed the environment at each reset; a different seed per episode
        # keeps the episodes distinct but repeatable.
        reset_return = env.reset(seed=123 + episode)
        if isinstance(reset_return, tuple):
            state, _ = reset_return
        else:
            state = reset_return

        done = False
        steps = 0

        while not done and steps < max_steps:
            step_return = env.step(pi(state))
            # Accept both step signatures, mirroring the reset handling above:
            # (obs, reward, terminated, truncated, info) and (obs, reward, done, info).
            if len(step_return) == 5:
                state, _, terminated, truncated, _ = step_return
                done = terminated or truncated
            else:
                state, _, done, _ = step_return
            steps += 1

        if state == goal_state:
            successes += 1

    return successes / n_episodes

In [34]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    """Estimate the average undiscounted return obtained by following a policy.

    Args:
        env: Environment exposing ``action_space.seed``, ``reset(seed=...)`` and ``step``.
        pi: Callable mapping a state to an action.
        n_episodes: Number of episodes to simulate.
        max_steps: Hard cap on steps per episode.

    Returns:
        The mean, over episodes, of the summed (undiscounted) rewards.
    """
    returns = []
    # Seed action space for reproducibility if sampling from it
    env.action_space.seed(123)

    for episode in range(n_episodes):
        # A different seed per episode keeps the episodes distinct but repeatable.
        reset_return = env.reset(seed=123 + episode)
        if isinstance(reset_return, tuple):
            state, _ = reset_return
        else:
            state = reset_return

        done = False
        steps = 0
        G = 0.0

        while not done and steps < max_steps:
            step_return = env.step(pi(state))
            # Accept both step signatures, mirroring the reset handling above:
            # (obs, reward, terminated, truncated, info) and (obs, reward, done, info).
            if len(step_return) == 5:
                state, reward, terminated, truncated, _ = step_return
                done = terminated or truncated
            else:
                state, reward, done, _ = step_return
            G += reward
            steps += 1

        returns.append(G)

    return np.mean(returns)

# Creating the Frozen Lake environment

In [35]:
# Custom 4x4 FrozenLake map; states are numbered row by row starting at 0.
envdesc = [
    "SFFF",
    "FHFH",
    "FFHF",
    "GFFH"
]
env = gym.make('FrozenLake-v1', desc=envdesc)
init_state = env.reset()
# Derive the goal index from the map (row-major position of 'G') so it cannot
# drift out of sync with envdesc; for this map it evaluates to 12.
goal_state = "".join(envdesc).index("G")
# Read the transition model from the base environment. `env.env` peels only one
# wrapper layer and depends on attribute forwarding through the remaining ones.
P = env.unwrapped.P

In [36]:
P

{0: {0: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  2: [(0.3333333333333333, 4, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)],
  3: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True)],
  1: [(0.3333333333333333, 0, 0.0, False),
   (0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False)],
  2: [(0.3333333333333333, 5, 0.0, True),
   (0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False)],
  3: [(0.3333333333333333, 2, 0.0, False),
   (0.3333333333333333, 1, 0.0, False),
   (0.3333333333333333, 0, 0.0, False)]},
 2:

# Value Iteration Algorithm

In [37]:
def value_iteration(P, gamma=1.0, theta=1e-10):
    """Run value iteration on a tabular MDP and return values plus a greedy policy.

    Args:
        P: Transition model indexed as ``P[s][a]``, each entry a list of
            ``(prob, next_state, reward, done)`` tuples. States are indexed
            ``0..len(P)-1`` and the action count is taken from ``P[0]``.
        gamma: Discount factor.
        theta: Sweeps stop once the largest change in any state value is below this.

    Returns:
        A tuple ``(V, pi)``: ``V`` is the array of state values and ``pi`` is a
        callable returning the greedy action (as ``int``) for a state.
    """
    n_states, n_actions = len(P), len(P[0])
    values = np.zeros(n_states, dtype=np.float64)

    converged = False
    while not converged:
        # One-step lookahead: expected return of every (state, action) pair
        # under the current value estimates.
        q_table = np.zeros((n_states, n_actions), dtype=np.float64)
        for state in range(n_states):
            for action in range(n_actions):
                for prob, nxt, reward, done in P[state][action]:
                    # Transitions flagged done contribute no bootstrapped value.
                    bootstrap = gamma * values[nxt] * (not done)
                    q_table[state][action] += prob * (reward + bootstrap)

        greedy_values = np.max(q_table, axis=1)
        if np.max(np.abs(values - greedy_values)) < theta:
            converged = True
        else:
            values = greedy_values.copy()

    def pi(s):
        # Deterministic policy: act greedily w.r.t. the final action values.
        return int(np.argmax(q_table[s]))

    return values, pi

In [38]:
# Finding the optimal policy: run value iteration on the transition model P with
# a discount factor of 0.99, keeping both the state values and the policy callable.
V_best_v, pi_best_v = value_iteration(P, gamma=0.99)


In [39]:
# Printing the policy: author identification lines first, then the policy found
# by value iteration. Only the policy grid is printed in this cell; the
# state-value function is printed separately further down.
print("Name: Yuvaraj S")
print("Register Number: 212222240119")
print('Optimal policy and state-value function (VI):')
print_policy(pi_best_v, P)

Name: Yuvaraj S
Register Number: 212222240119
Optimal policy and state-value function (VI):
Policy:
| 00      < | 01      ^ | 02      ^ | 03      ^ |
| 04      < |           | 06      < |           |
| 08      < | 09      < |           | 11      < |
|           | 13      < | 14      < |           |


In [40]:
# Report the success rate (as a percentage) and the mean undiscounted return
# of the value-iteration policy.
print(
    'Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(
        probability_success(env, pi_best_v, goal_state=goal_state) * 100,
        mean_return(env, pi_best_v),
    )
)

Reaches goal 100.00%. Obtains an average undiscounted return of 1.0000.


In [268]:
# Printing the state-value function returned by value iteration, rounded to 4 decimals.
print_state_value_function(V_best_v, P, prec=4)

State-value function:
| 00 0.8514 | 01 0.7835 | 02 0.7393 | 03 0.7176 |
| 04 0.8772 |           | 06  0.244 |           |
| 08 0.9296 | 09 0.5623 |           | 11    0.0 |
|           | 13 0.7745 | 14 0.3815 |           |
